We will finally analyse this data properly.
After discussion with Amanda, we’ll be using the strictestRace/Geography assignments, see 06_revisiting_population_terms for more context on what this means.
# First we clean up the plotting labels:
allSRAFinal <- allSRAFinal %>%
mutate(strictestRace = gsub("or ", "or\n", strictestRace)) %>%
mutate(strictestGeography = gsub("and ", "and\n", strictestGeography)) %>%
mutate(strictestGeography = gsub("^Asia$", "Asia (NOS)", strictestGeography)) %>%
mutate(strictestGeography = gsub("Subsaharan", "Sub-Saharan", strictestGeography)) %>%
mutate(hispanic = gsub("^hispanic$", "Hispanic", hispanic)) %>%
mutate(finalOrgan = gsub("^cancer$", "Cancer sample\n(NOS)", finalOrgan)) %>%
mutate(finalSystem = gsub("^cancer$", "Cancer sample\n(NOS)", finalSystem)) %>%
as.data.frame()
# For moving columns to upper case, because I forgot...
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
allSRAFinal$finalDisease <- firstup(allSRAFinal$finalDisease)
allSRAFinal$finalOrgan <- firstup(allSRAFinal$finalOrgan)
allSRAFinal$finalSystem <- firstup(allSRAFinal$finalSystem)
# Hispanic label trumps any other label
allSRAFinal$backupRace <- allSRAFinal$strictestRace
allSRAFinal$backupHispanic <- allSRAFinal$hispanic
allSRAFinal <- allSRAFinal %>% mutate(hispanic = na_if(hispanic, "non.hispanic"))
allSRAFinal$strictestRace <- coalesce(allSRAFinal$hispanic, allSRAFinal$strictestRace)
allSRAFinal$strictestRace <- factor(allSRAFinal$strictestRace)
# Now let's sort out some colour schemes... For Race we simply use Dark2 but we need to keep the mapping consistent, but for Geography we are following Alicia Martin and using a somewhat custom palette that doesn't follow anything standard, and then adding some new colours of our own; I opened a paper in Affinity and grabbed the html values and then added a couple for the new entries (Southeast Asia, Asia NOS):
allSRAFinal$strictestGeography <- factor(allSRAFinal$strictestGeography, levels = c("Sub-Saharan Africa", "North Africa and\nWestern Asia", "Europe", "South Asia", "Southeast Asia", "East Asia", "Asia (NOS)", "Oceania", "Americas", "Multiple", "Other"))
scale_fill_geography <- function(...){
ggplot2:::manual_scale('fill',
values = setNames(c('#9C8DC3', '#F3D78A', '#DB6968', '#60BC55', '#BCCC45', '#4D97CD', '#04C3C8', '#C69C3A', '#F8984E', '#8B96AD', '#FBC9C4'), levels(allSRAFinal$strictestGeography)),
)
}
scale_color_geography <- function(...){
ggplot2:::manual_scale('color',
values = setNames(c('#9C8DC3', '#F3D78A', '#DB6968', '#60BC55', '#BCCC45', '#4D97CD', '#04C3C8', '#C69C3A', '#F8984E', '#8B96AD', '#FBC9C4'), levels(allSRAFinal$strictestGeography)),
)
}
raceColours <- brewer.pal(8,"Set2") #8 because we don't plot NAs
scale_color_race <- function(...){
ggplot2:::manual_scale('color',
values = setNames(raceColours, levels(allSRAFinal$strictestRace)),
)
}
scale_fill_race <- function(...){
ggplot2:::manual_scale('fill',
values = setNames(raceColours, levels(allSRAFinal$strictestRace)),
)
}
wberCols <- c(viridis::turbo(n = 7))
scale_fill_wber <- function(...){
ggplot2:::manual_scale('fill',
values = setNames(wberCols, c("East Asia &\nPacific", "Europe &\nCentral Asia", "Latin America &\nCaribbean", "Middle East &\nNorth Africa", "North America", "South Asia", "Sub-Saharan Africa")), na.value = "grey50")
}
# We also define a couple of plot layouts:
long2Design <- "
12
12
"
wide2Design <- "
11
22
"
theme_set(theme_bw(base_size = 6))
theme_update(axis.text=element_text(size=7))
theme_update(legend.key.size = unit(12, 'pt'), #change legend key size
legend.title = element_text(size=8), #change legend title font size
legend.text = element_text(size=6),
plot.title = element_text(size=8))
theme_update(plot.margin = unit(c(0, 0, 0, 0), "pt"))
We’re also going to use some additional info from the World Bank, so let’s add those columns in now:
worldBank <- read.csv("World_Bank_Descriptors.csv")
# Some names need to be fixed, but others are more flexible? I'd rather have USA than United States.
allSRAFinal$finalCountry <- gsub("^UK$", "United Kingdom", allSRAFinal$finalCountry)
allSRAFinal$finalCountry <- gsub("^Korea$", "South Korea", allSRAFinal$finalCountry)
allSRAFinal$worldBank <- worldBank[match(allSRAFinal$finalCountry, worldBank$Economy),]$Income.group
# allSRAFinal %>% count(finalCountry, worldBank)
# Missing countries are all high income, so we can set them to that manually now that we're happy with the names:
allSRAFinal <- allSRAFinal %>%
mutate(worldBank = if_else(is.na(worldBank), "High income", worldBank)) %>%
mutate(worldBank = if_else(is.na(finalCountry), NA, worldBank ))
# And now the regions they define:
allSRAFinal$worldRegion <- worldBank[match(allSRAFinal$finalCountry, worldBank$Economy),]$Region
# allSRAFinal %>% count(finalCountry, worldRegion)
# Once again we gotta add some manually...
allSRAFinal <- allSRAFinal %>%
mutate(worldRegion = if_else(grepl("Russia|Slovakia", finalCountry), "Europe & Central Asia", worldRegion)) %>%
mutate(worldRegion = if_else(grepl("South Korea|Taiwan", finalCountry), "East Asia & Pacific", worldRegion)) %>%
mutate(worldRegion = if_else(grepl("USA", finalCountry), "North America", worldRegion)) %>%
as.data.frame()
# And now we need to clean up some labels so we can plot them better. Good thing is, it's just an ampersand:
allSRAFinal$worldRegion <- gsub("& ", "&\n", allSRAFinal$worldRegion)
geographyProp <- allSRAFinal %>% drop_na(strictestGeography) %>% count(strictestGeography) %>% mutate(freq = n/sum(n))
raceProp <- allSRAFinal %>% drop_na(strictestRace) %>% count(strictestRace) %>% mutate(freq = n/sum(n))
geoPlot <- ggplot(geographyProp, aes(x = fct_inorder(strictestGeography), y = n, fill = strictestGeography)) +
geom_bar(stat="identity") +
ggtitle("Submitted descriptor\n(ancestral/geographic origin)") +
xlab("") +
ylab("Samples") +
coord_flip() +
scale_fill_geography() +
theme(axis.text.x = element_text(angle = 45, hjust=1)) +
theme(legend.position="none")
geographyProp
## strictestGeography n freq
## 1 Sub-Saharan Africa 1694 0.172417303
## 2 North Africa and\nWestern Asia 18 0.001832061
## 3 Europe 5469 0.556641221
## 4 South Asia 718 0.073078880
## 5 Southeast Asia 49 0.004987277
## 6 East Asia 1121 0.114096692
## 7 Asia (NOS) 310 0.031552163
## 8 Oceania 11 0.001119593
## 9 Americas 166 0.016895674
## 10 Multiple 207 0.021068702
## 11 Other 62 0.006310433
sum(geographyProp$n)
## [1] 9825
racePlot <- ggplot(raceProp, aes(x = fct_inorder(strictestRace), y = n, fill=strictestRace)) +
geom_bar(stat="identity") +
ggtitle("Submitted descriptor\n(US Census term)") +
xlab("") +
ylab("Samples") +
coord_flip() +
scale_fill_race() +
theme(axis.text.x = element_text(angle = 45, hjust=1)) +
theme(legend.position="none")
raceProp
## strictestRace n freq
## 1 American Indian and\nAlaska Native 43 0.0030594095
## 2 Asian 944 0.0671647101
## 3 Black or\nAfrican American 1742 0.1239416578
## 4 Hispanic 1255 0.0892920669
## 5 Multiple 203 0.0144432586
## 6 Native Hawaiian and\nother Pacific Islander 6 0.0004268943
## 7 Other 162 0.0115261473
## 8 White 9700 0.6901458556
sum(raceProp$n)
## [1] 14055
geoPlot + racePlot +
plot_layout(design = long2Design)
ggsave("fig1_overall.pdf")
## Saving 7 x 5 in image
# And now with log transform on the axis:
geoPlot <- geoPlot +
scale_y_continuous(trans='log10')
racePlot <- racePlot +
scale_y_continuous(trans='log10')
geoPlot + racePlot +
plot_layout(design = long2Design)
ggsave("fig1_overall_log10.pdf")
## Saving 7 x 5 in image
# There's 132 samples that do not have any info besides "Not Hispanic", which I think we should treat as NA, and therefore drop. The other option is to change them to race: Other and then move on with our lives.
allSRAFinal %>% filter(is.na(allSRAFinal$strictestGeography) & is.na(allSRAFinal$strictestRace)) %>% count(ETHNICITY, RACE)
## ETHNICITY RACE n
## 1 American indian or alaskan native <NA> 14
## 2 Non-Hispanic <NA> 36
## 3 Not Hispanic or Latino Unknown or Not Reported 4
## 4 Not Hispanic or Latino <NA> 92
## 5 <NA> American Indian or Alaskan Native 2
allSRAFinal <- allSRAFinal %>% filter(!is.na(allSRAFinal$strictestGeography) | !is.na(allSRAFinal$strictestRace))
nrow(allSRAFinal[!is.na(allSRAFinal$strictestGeography) | !is.na(allSRAFinal$strictestRace),])
## [1] 23880
dim(allSRAFinal)
## [1] 23880 49
All samples accounted for now, at last!
So let’s get a bit more granular, showing n samples per study:
## Using SRA.Study, strictestGeography as id variables
## Using SRA.Study, strictestRace as id variables
Let’s get some meaningful statistics on this:
First we separate studies on the basis of whether they contain racial or geographic descriptors, but the hispanic usage introduces a new complexity… There’s a bunch of studies that now straddle the two categories. Could shift them to race like we did before, but better to just leave them alone for now? I am overthinking.
geographyClean <- allSRAFinal %>% drop_na(strictestGeography)
raceClean <- allSRAFinal %>% drop_na(strictestRace)
geoSRA <- unique(geographyClean$SRA.Study)
length(geoSRA)
## [1] 135
raceSRA <- unique(raceClean$SRA.Study)
length(raceSRA)
## [1] 139
length(intersect(geoSRA, raceSRA))
## [1] 13
allSRAFinal %>% filter(grepl(paste(intersect(geoSRA, raceSRA), collapse="|"), SRA.Study)) %>%
group_by(SRA.Study) %>%
summarise(size = n(), studyJoint = n_distinct(SRA.Study))
## # A tibble: 13 Ă— 3
## SRA.Study size studyJoint
## <chr> <int> <int>
## 1 DRP001797 42 1
## 2 SRP070663 12 1
## 3 SRP086245 42 1
## 4 SRP108559 32 1
## 5 SRP172694 6 1
## 6 SRP216947 36 1
## 7 SRP221484 195 1
## 8 SRP245400 345 1
## 9 SRP274641 6 1
## 10 SRP283115 17 1
## 11 SRP303641 100 1
## 12 SRP374111 100 1
## 13 SRP388678 492 1
allSRAFinal %>% filter(grepl(paste(intersect(geoSRA, raceSRA), collapse="|"), SRA.Study)) %>% nrow()
## [1] 1425
# Diversity by study:
geographyClean %>% group_by(SRA.Study) %>%
summarise(size = n(), studyGeo = n_distinct(strictestGeography)) %>%
summarise(n = size, meanSize = mean(size), meanGeo = mean(studyGeo), maxSize = max(size), maxGeo = max(studyGeo), sdSize = sd(size), sdGeo = sd(studyGeo))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## # A tibble: 135 Ă— 7
## n meanSize meanGeo maxSize maxGeo sdSize sdGeo
## <int> <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 40 72.8 1.67 753 6 128. 1.15
## 2 464 72.8 1.67 753 6 128. 1.15
## 3 332 72.8 1.67 753 6 128. 1.15
## 4 51 72.8 1.67 753 6 128. 1.15
## 5 133 72.8 1.67 753 6 128. 1.15
## 6 66 72.8 1.67 753 6 128. 1.15
## 7 20 72.8 1.67 753 6 128. 1.15
## 8 159 72.8 1.67 753 6 128. 1.15
## 9 44 72.8 1.67 753 6 128. 1.15
## 10 181 72.8 1.67 753 6 128. 1.15
## # ℹ 125 more rows
raceClean %>% group_by(SRA.Study) %>%
summarise(size = n(), studyRace = n_distinct(strictestRace)) %>%
summarise(n = size, meanSize = mean(size), meanRace = mean(studyRace), maxSize = max(size), maxRace = max(studyRace), sdSize = sd(size), sdRace = sd(studyRace))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## # A tibble: 139 Ă— 7
## n meanSize meanRace maxSize maxRace sdSize sdRace
## <int> <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 2 101. 2.26 1857 8 213. 1.44
## 2 36 101. 2.26 1857 8 213. 1.44
## 3 13 101. 2.26 1857 8 213. 1.44
## 4 20 101. 2.26 1857 8 213. 1.44
## 5 95 101. 2.26 1857 8 213. 1.44
## 6 37 101. 2.26 1857 8 213. 1.44
## 7 9 101. 2.26 1857 8 213. 1.44
## 8 10 101. 2.26 1857 8 213. 1.44
## 9 12 101. 2.26 1857 8 213. 1.44
## 10 113 101. 2.26 1857 8 213. 1.44
## # ℹ 129 more rows
# By descriptor across all studies
geographyClean %>% group_by(SRA.Study) %>% count(strictestGeography) %>% group_by(strictestGeography) %>%
summarise(size = sum(n), studyGeo = n_distinct(SRA.Study), mean = mean(n), max = max(n), sd = sd(n))
## # A tibble: 11 Ă— 6
## strictestGeography size studyGeo mean max sd
## <fct> <int> <int> <dbl> <int> <dbl>
## 1 "Sub-Saharan Africa" 1694 42 40.3 355 70.7
## 2 "North Africa and\nWestern Asia" 18 7 2.57 9 2.94
## 3 "Europe" 5469 87 62.9 753 109.
## 4 "South Asia" 718 13 55.2 365 102.
## 5 "Southeast Asia" 49 2 24.5 48 33.2
## 6 "East Asia" 1121 25 44.8 208 57.5
## 7 "Asia (NOS)" 310 25 12.4 66 16.6
## 8 "Oceania" 11 5 2.2 3 0.447
## 9 "Americas" 166 9 18.4 39 14.3
## 10 "Multiple" 207 5 41.4 153 63.5
## 11 "Other" 62 6 10.3 41 15.4
raceClean %>% group_by(SRA.Study) %>% count(strictestRace) %>% group_by(strictestRace) %>%
summarise(size = sum(n), studyRace = n_distinct(SRA.Study), mean = mean(n), max = max(n), sd = sd(n))
## # A tibble: 8 Ă— 6
## strictestRace size studyRace mean max sd
## <fct> <int> <int> <dbl> <int> <dbl>
## 1 "American Indian and\nAlaska Native" 43 10 4.3 18 5.38
## 2 "Asian" 944 47 20.1 164 29.9
## 3 "Black or\nAfrican American" 1742 74 23.5 177 30.9
## 4 "Hispanic" 1255 50 25.1 562 79.8
## 5 "Multiple" 203 13 15.6 49 16.5
## 6 "Native Hawaiian and\nother Pacific Islander" 6 3 2 3 1
## 7 "Other" 162 11 14.7 64 22.3
## 8 "White" 9700 106 91.5 1005 162.
There are many ways to slice this, but some are harder to see than others, so here is the final best attempt…
## Using finalCountry, strictestGeography, worldRegion as id variables
## Warning: The `legend.title.align` argument of `theme()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use theme(legend.title = element_text(hjust)) instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Using finalCountry, strictestRace, worldRegion as id variables
## Saving 7 x 5 in image
## Using worldRegion, strictestGeography as id variables
## Using worldRegion, strictestRace as id variables
Some nice statistics about all of this:
# By study by world region:
geographyClean %>% group_by(worldRegion, SRA.Study) %>%
summarise(size = n(), studyGeo = n_distinct(strictestGeography)) %>%
summarise(n = n(), meanSize = mean(size), meanGeo = mean(studyGeo), maxSize = max(size), maxGeo = max(studyGeo), sdSize = sd(size), sdGeo = sd(studyGeo))
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
## # A tibble: 6 Ă— 8
## worldRegion n meanSize meanGeo maxSize maxGeo sdSize sdGeo
## <chr> <int> <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 "East Asia &\nPacific" 32 44.6 1.31 238 5 59.2 0.931
## 2 "Europe &\nCentral Asia" 40 85.9 1.4 748 4 141. 0.810
## 3 "Latin America &\nCaribbean" 4 21 1 27 1 5.48 0
## 4 "North America" 54 89.1 2.17 753 6 152. 1.37
## 5 "South Asia" 2 13 2 14 3 1.41 1.41
## 6 <NA> 3 13.3 1 18 1 5.03 0
raceClean %>% group_by(worldRegion, SRA.Study) %>%
summarise(size = n(), studyRace = n_distinct(strictestRace)) %>%
summarise(n = n(), meanSize = mean(size), meanRace = mean(studyRace), maxSize = max(size), maxRace = max(studyRace), sdSize = sd(size), sdRace = sd(studyRace))
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
## # A tibble: 4 Ă— 8
## worldRegion n meanSize meanRace maxSize maxRace sdSize sdRace
## <chr> <int> <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 "East Asia &\nPacific" 14 27.6 1.36 166 4 40.9 0.842
## 2 "Europe &\nCentral Asia" 14 180 2.43 772 5 319. 1.45
## 3 "Middle East &\nNorth Africa" 1 11 1 11 1 NA NA
## 4 "North America" 110 101. 2.36 1857 8 209. 1.47
# By descriptor by economic region
geographyClean %>% group_by(SRA.Study, worldRegion) %>% count(strictestGeography) %>% group_by(worldRegion, strictestGeography) %>%
summarise(size = sum(n), studyGeo = n_distinct(SRA.Study), mean = mean(n), max = max(n), sd = sd(n)) %>%
as.data.frame()
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
## worldRegion strictestGeography size studyGeo mean max sd
## 1 East Asia &\nPacific Sub-Saharan Africa 9 2 4.500000 8 4.9497475
## 2 East Asia &\nPacific North Africa and\nWestern Asia 3 1 3.000000 3 NA
## 3 East Asia &\nPacific Europe 191 8 23.875000 61 21.1351941
## 4 East Asia &\nPacific South Asia 60 3 20.000000 44 21.6333077
## 5 East Asia &\nPacific Southeast Asia 49 2 24.500000 48 33.2340187
## 6 East Asia &\nPacific East Asia 1016 22 46.181818 208 60.6360229
## 7 East Asia &\nPacific Asia (NOS) 100 4 25.000000 39 13.4907376
## 8 Europe &\nCentral Asia Sub-Saharan Africa 516 10 51.600000 129 42.8205558
## 9 Europe &\nCentral Asia Europe 2051 32 64.093750 375 91.6058737
## 10 Europe &\nCentral Asia South Asia 611 5 122.200000 365 148.2588952
## 11 Europe &\nCentral Asia East Asia 40 1 40.000000 40 NA
## 12 Europe &\nCentral Asia Asia (NOS) 23 3 7.666667 18 9.0737717
## 13 Europe &\nCentral Asia Americas 40 2 20.000000 39 26.8700577
## 14 Europe &\nCentral Asia Multiple 153 1 153.000000 153 NA
## 15 Europe &\nCentral Asia Other 3 2 1.500000 2 0.7071068
## 16 Latin America &\nCaribbean Americas 84 4 21.000000 27 5.4772256
## 17 North America Sub-Saharan Africa 1169 30 38.966667 355 79.7541409
## 18 North America North Africa and\nWestern Asia 15 6 2.500000 9 3.2093613
## 19 North America Europe 3186 43 74.093023 753 131.8102558
## 20 North America South Asia 23 3 7.666667 14 5.5075705
## 21 North America East Asia 65 2 32.500000 63 43.1335137
## 22 North America Asia (NOS) 186 17 10.941176 66 17.7463749
## 23 North America Oceania 11 5 2.200000 3 0.4472136
## 24 North America Americas 42 3 14.000000 36 19.0787840
## 25 North America Multiple 54 4 13.500000 26 13.8684294
## 26 North America Other 59 4 14.750000 41 17.8021534
## 27 South Asia Europe 1 1 1.000000 1 NA
## 28 South Asia South Asia 24 2 12.000000 12 0.0000000
## 29 South Asia Asia (NOS) 1 1 1.000000 1 NA
## 30 <NA> Europe 40 3 13.333333 18 5.0332230
raceClean %>% group_by(SRA.Study, worldRegion) %>% count(strictestRace) %>% group_by(worldRegion, strictestRace) %>%
summarise(size = sum(n), studyRace = n_distinct(SRA.Study), mean = mean(n), max = max(n), sd = sd(n)) %>%
as.data.frame()
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
## worldRegion strictestRace size studyRace mean max sd
## 1 East Asia &\nPacific American Indian and\nAlaska Native 3 1 3.00000 3 NA
## 2 East Asia &\nPacific Asian 338 11 30.72727 164 44.884498
## 3 East Asia &\nPacific Black or\nAfrican American 2 1 2.00000 2 NA
## 4 East Asia &\nPacific Hispanic 14 2 7.00000 12 7.071068
## 5 East Asia &\nPacific White 29 4 7.25000 13 5.123475
## 6 Europe &\nCentral Asia American Indian and\nAlaska Native 3 1 3.00000 3 NA
## 7 Europe &\nCentral Asia Asian 194 4 48.50000 96 54.848276
## 8 Europe &\nCentral Asia Black or\nAfrican American 175 9 19.44444 80 25.884895
## 9 Europe &\nCentral Asia Hispanic 167 5 33.40000 96 37.825917
## 10 Europe &\nCentral Asia Other 142 3 47.33333 64 17.009801
## 11 Europe &\nCentral Asia White 1839 12 153.25000 630 244.541547
## 12 Middle East &\nNorth Africa White 11 1 11.00000 11 NA
## 13 North America American Indian and\nAlaska Native 37 8 4.62500 18 6.045955
## 14 North America Asian 412 32 12.87500 55 13.689530
## 15 North America Black or\nAfrican American 1565 64 24.45312 177 31.828873
## 16 North America Hispanic 1074 43 24.97674 562 85.245161
## 17 North America Multiple 203 13 15.61538 49 16.545586
## 18 North America Native Hawaiian and\nother Pacific Islander 6 3 2.00000 3 1.000000
## 19 North America Other 20 8 2.50000 5 1.309307
## 20 North America White 7821 89 87.87640 1005 151.443495
allSRAFinal %>% filter(!is.na(worldBank)) %>% count(worldBank) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## worldBank n freq
## 1 High income 23266 0.975922819
## 2 Upper middle income 548 0.022986577
## 3 Lower middle income 26 0.001090604
allSRAFinal %>% filter(!is.na(finalCountry)) %>% count(finalCountry) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## finalCountry n freq
## 1 USA 14804 0.6209731544
## 2 Belgium 2413 0.1012164430
## 3 United Kingdom 2018 0.0846476510
## 4 Canada 1144 0.0479865772
## 5 Sweden 540 0.0226510067
## 6 China 464 0.0194630872
## 7 Singapore 452 0.0189597315
## 8 South Korea 411 0.0172399329
## 9 Switzerland 278 0.0116610738
## 10 Taiwan 222 0.0093120805
## 11 Russia 166 0.0069630872
## 12 Japan 139 0.0058305369
## 13 Australia 126 0.0052852349
## 14 Denmark 124 0.0052013423
## 15 Italy 99 0.0041526846
## 16 Spain 91 0.0038171141
## 17 Germany 78 0.0032718121
## 18 Brazil 57 0.0023909396
## 19 Hungary 54 0.0022651007
## 20 Poland 34 0.0014261745
## 21 Greece 28 0.0011744966
## 22 Mexico 27 0.0011325503
## 23 India 26 0.0010906040
## 24 Slovakia 23 0.0009647651
## 25 Israel 11 0.0004614094
## 26 Austria 8 0.0003355705
## 27 France 3 0.0001258389
allSRAFinal %>% filter(!is.na(strictestGeography)) %>% count(worldBank) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## worldBank n freq
## 1 High income 9332 0.949821883
## 2 Upper middle income 427 0.043460560
## 3 <NA> 40 0.004071247
## 4 Lower middle income 26 0.002646310
allSRAFinal %>% filter(!is.na(strictestRace)) %>% count(worldBank) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## worldBank n freq
## 1 High income 13934 0.991390964
## 2 Upper middle income 121 0.008609036
allSRAFinal %>% filter(!is.na(strictestGeography)) %>% count(finalCountry) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## finalCountry n freq
## 1 USA 3738 0.3804580153
## 2 United Kingdom 1951 0.1985750636
## 3 Canada 1072 0.1091094148
## 4 Sweden 540 0.0549618321
## 5 South Korea 411 0.0418320611
## 6 China 343 0.0349109415
## 7 Singapore 286 0.0291094148
## 8 Switzerland 278 0.0282951654
## 9 Taiwan 222 0.0225954198
## 10 Australia 126 0.0128244275
## 11 Denmark 124 0.0126208651
## 12 Russia 118 0.0120101781
## 13 Italy 87 0.0088549618
## 14 Spain 79 0.0080407125
## 15 Germany 72 0.0073282443
## 16 Brazil 57 0.0058015267
## 17 Belgium 56 0.0056997455
## 18 Hungary 54 0.0054961832
## 19 Japan 40 0.0040712468
## 20 <NA> 40 0.0040712468
## 21 Greece 28 0.0028498728
## 22 Mexico 27 0.0027480916
## 23 India 26 0.0026463104
## 24 Poland 24 0.0024427481
## 25 Slovakia 23 0.0023409669
## 26 France 3 0.0003053435
allSRAFinal %>% filter(!is.na(strictestRace)) %>% count(finalCountry) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## finalCountry n freq
## 1 USA 11066 0.7873354678
## 2 Belgium 2357 0.1676983280
## 3 Singapore 166 0.0118107435
## 4 China 121 0.0086090359
## 5 Japan 99 0.0070437567
## 6 Canada 72 0.0051227321
## 7 United Kingdom 67 0.0047669868
## 8 Russia 48 0.0034151547
## 9 Italy 12 0.0008537887
## 10 Spain 12 0.0008537887
## 11 Israel 11 0.0007826396
## 12 Poland 10 0.0007114906
## 13 Austria 8 0.0005691925
## 14 Germany 6 0.0004268943
allSRAFinal %>% filter(!is.na(strictestGeography)) %>% count(worldRegion) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## worldRegion n freq
## 1 North America 4810 0.489567430
## 2 Europe &\nCentral Asia 3437 0.349821883
## 3 East Asia &\nPacific 1428 0.145343511
## 4 Latin America &\nCaribbean 84 0.008549618
## 5 <NA> 40 0.004071247
## 6 South Asia 26 0.002646310
allSRAFinal %>% filter(!is.na(strictestRace)) %>% count(worldRegion) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## worldRegion n freq
## 1 North America 11138 0.7924581999
## 2 Europe &\nCentral Asia 2520 0.1792956243
## 3 East Asia &\nPacific 386 0.0274635361
## 4 Middle East &\nNorth Africa 11 0.0007826396
allSRAFinal %>% count(worldRegion) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
## worldRegion n freq
## 1 North America 15948 0.6678391960
## 2 Europe &\nCentral Asia 5957 0.2494556114
## 3 East Asia &\nPacific 1814 0.0759631491
## 4 Latin America &\nCaribbean 84 0.0035175879
## 5 <NA> 40 0.0016750419
## 6 South Asia 26 0.0010887772
## 7 Middle East &\nNorth Africa 11 0.0004606365
# And finally... are studies from certain regions bigger than others?
NAEurSizes <- allSRAFinal %>% filter(worldRegion == "North America" | worldRegion == "Europe") %>%
count(SRA.Study)
restWorldSizes <- allSRAFinal %>% filter(worldRegion != "North America" & worldRegion != "Europe") %>%
count(SRA.Study)
t.test(restWorldSizes$n, NAEurSizes$n)
##
## Welch Two Sample t-test
##
## data: restWorldSizes$n and NAEurSizes$n
## t = -1.3275, df = 251.9, p-value = 0.1855
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -72.20593 14.05916
## sample estimates:
## mean of x mean of y
## 75.1619 104.2353
allStudySizes <- allSRAFinal %>% group_by(SRA.Study) %>%
summarise(sampleSize = n(), region = worldRegion) %>%
distinct()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'SRA.Study'. You can override using the `.groups` argument.
lm(sampleSize ~ region, data = allStudySizes)
##
## Call:
## lm(formula = sampleSize ~ region, data = allStudySizes)
##
## Coefficients:
## (Intercept) regionEurope &\nCentral Asia regionLatin America &\nCaribbean regionMiddle East &\nNorth Africa regionNorth America
## 40.31 72.09 -19.31 -29.31 63.92
## regionSouth Asia
## -27.31
anova(lm(sampleSize ~ region, data = allStudySizes))
## Analysis of Variance Table
##
## Response: sampleSize
## Df Sum Sq Mean Sq F value Pr(>F)
## region 5 204346 40869 1.2516 0.2855
## Residuals 252 8228480 32653
# Note that there's an issue here - although we filtered for studies with more than 10 entries, not all of those will have 10 samples with descriptors, which explains why these numbers are smaller. So long as we are consistent across the two datasets, it should be ok.
Maybe worth considering some alluvial plots? At the single country level they’re very messy, however, so what about at the World Bank region level?
allSRAFinal %>% count(strictestGeography, worldRegion) %>% drop_na(c(strictestGeography, worldRegion)) %>%
ggplot(data = .,
aes(axis1 = worldRegion, axis2 = strictestGeography, y = n)) +
scale_x_discrete(limits = c("SRA Depositor\nRegion", "Population\nDescriptor"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = strictestGeography)) +
scale_fill_geography(name="Population\nDescriptor") +
geom_stratum(width=1/3) +
ylab("Samples") +
geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
theme_minimal(base_size = 6) +
theme(legend.position = "none")
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
allSRAFinal %>% count(strictestRace, worldRegion) %>% drop_na(c(strictestRace, worldRegion)) %>%
ggplot(data = .,
aes(axis1 = worldRegion, axis2 = strictestRace, y = n)) +
scale_x_discrete(limits = c("SRA Depositor\nRegion", "US Census Racial Term"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = strictestRace)) +
scale_fill_race(name="US Census\nTerm)") +
geom_stratum(width=1/3) +
ylab("Samples") +
# geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
theme_minimal(base_size = 6) +
theme(legend.position = "none")
### And finally... does it have any kind of description??
allSRAFinal <- allSRAFinal %>%
mutate(hasDescriptor = if_else(is.na(strictestRace), "Geographic", "Racial"))
# Easiest to see with a boxplot?
descriptorUsePlot <- ggplot(allSRAFinal, aes(x = finalCountry, fill = hasDescriptor)) +
geom_bar(position = "fill") +
ggtitle("") +
xlab("SRA depositor country") +
ylab("Proportion of samples") +
coord_fixed(ratio=6) +
guides(fill=guide_legend(title="Descriptor\ntype")) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
descriptorUsePlot
ggsave("fig2_descriptor_use.pdf")
## Saving 7 x 5 in image
descriptorUseRegionPlot <- ggplot(allSRAFinal, aes(x = worldRegion, fill = hasDescriptor)) +
geom_bar(position = "fill") +
ggtitle("") +
xlab("SRA depositor region") +
ylab("Proportion of samples") +
guides(fill=guide_legend(title="Descriptor\ntype")) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
descriptorUseRegionPlot
ggsave("fig2_descriptor_use_region_plot.pdf")
## Saving 7 x 5 in image
descriptorUseRegionAlluvial <- allSRAFinal %>% count(hasDescriptor, worldRegion) %>% drop_na(c(hasDescriptor, worldRegion)) %>%
ggplot(data = .,
aes(axis1 = worldRegion, axis2 = hasDescriptor, y = n)) +
scale_x_discrete(limits = c("SRA Depositor\nRegion", "Population\nDescriptor"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = hasDescriptor)) +
geom_stratum(width=1/3) +
ylab("Samples") +
geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
theme_minimal(base_size = 6) +
theme(legend.position = "none")
descriptorUseRegionAlluvial
ggsave("fig2_descriptor_use_region_alluvial.pdf")
## Saving 7 x 5 in image
And now, some statistics from the figures above:
allSRAFinal %>% count(finalCountry, hasDescriptor) %>%
group_by(finalCountry) %>%
mutate(percent = 100 * n/sum(n)) %>%
ungroup %>%
as.data.frame()
## finalCountry hasDescriptor n percent
## 1 Australia Geographic 126 100.000000
## 2 Austria Racial 8 100.000000
## 3 Belgium Geographic 56 2.320763
## 4 Belgium Racial 2357 97.679237
## 5 Brazil Geographic 57 100.000000
## 6 Canada Geographic 1072 93.706294
## 7 Canada Racial 72 6.293706
## 8 China Geographic 343 73.922414
## 9 China Racial 121 26.077586
## 10 Denmark Geographic 124 100.000000
## 11 France Geographic 3 100.000000
## 12 Germany Geographic 72 92.307692
## 13 Germany Racial 6 7.692308
## 14 Greece Geographic 28 100.000000
## 15 Hungary Geographic 54 100.000000
## 16 India Geographic 26 100.000000
## 17 Israel Racial 11 100.000000
## 18 Italy Geographic 87 87.878788
## 19 Italy Racial 12 12.121212
## 20 Japan Geographic 40 28.776978
## 21 Japan Racial 99 71.223022
## 22 Mexico Geographic 27 100.000000
## 23 Poland Geographic 24 70.588235
## 24 Poland Racial 10 29.411765
## 25 Russia Geographic 118 71.084337
## 26 Russia Racial 48 28.915663
## 27 Singapore Geographic 286 63.274336
## 28 Singapore Racial 166 36.725664
## 29 Slovakia Geographic 23 100.000000
## 30 South Korea Geographic 411 100.000000
## 31 Spain Geographic 79 86.813187
## 32 Spain Racial 12 13.186813
## 33 Sweden Geographic 540 100.000000
## 34 Switzerland Geographic 278 100.000000
## 35 Taiwan Geographic 222 100.000000
## 36 USA Geographic 3738 25.249932
## 37 USA Racial 11066 74.750068
## 38 United Kingdom Geographic 1951 96.679881
## 39 United Kingdom Racial 67 3.320119
## 40 <NA> Geographic 40 100.000000
allSRAFinal %>% filter(!grepl('USA', finalCountry)) %>%
count(hasDescriptor) %>%
mutate(freq = n/sum(n))
## hasDescriptor n freq
## 1 Geographic 6087 0.6706699
## 2 Racial 2989 0.3293301
allSRAFinal %>% filter(grepl('USA', finalCountry)) %>%
count(hasDescriptor) %>%
mutate(freq = n/sum(n))
## hasDescriptor n freq
## 1 Geographic 3738 0.2524993
## 2 Racial 11066 0.7475007
descriptorChi <- full_join((allSRAFinal %>% filter(!grepl('USA', finalCountry)) %>%
count(hasDescriptor) %>%
mutate(freq = n/sum(n))),
(allSRAFinal %>% filter(grepl('USA', finalCountry)) %>%
count(hasDescriptor) %>%
mutate(freq = n/sum(n))), by = "hasDescriptor", suffix = c("noUSA", "USA"))
chisq.test(descriptorChi[,c(2,4)])
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: descriptorChi[, c(2, 4)]
## X-squared = 4061.3, df = 1, p-value < 2.2e-16
allSRAFinal %>% count(hasDescriptor) %>%
mutate(freq = n/sum(n))
## hasDescriptor n freq
## 1 Geographic 9825 0.4114322
## 2 Racial 14055 0.5885678
# We also want to know the mean sample size of studies in the USA vs those not in the USA, for funsies
usaSizes <- allSRAFinal %>% filter(grepl('USA', finalCountry)) %>%
group_by(SRA.Study) %>%
summarise(n = n())
mean(usaSizes$n)
## [1] 103.5245
nousaSizes <- allSRAFinal %>% filter(!grepl('USA', finalCountry)) %>%
group_by(SRA.Study) %>%
summarise(n = n())
mean(nousaSizes$n)
## [1] 76.91525
t.test(usaSizes$n, nousaSizes$n)
##
## Welch Two Sample t-test
##
## data: usaSizes$n and nousaSizes$n
## t = 1.2117, df = 258.91, p-value = 0.2267
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -16.63314 69.85159
## sample estimates:
## mean of x mean of y
## 103.52448 76.91525
As above, we start by looking simply at the country/region where sequencing is happening. A quick straw poll suggested finalOrgan was more interpretable than finalSystem, so I’m sticking with organ. Will have to work out how to do plots of disease and tissue with all the missing values, but I think we’re getting closer
# First we focus on population descriptors:
geographySummary <- allSRAFinal %>% count(strictestGeography, finalCountry, worldRegion, finalOrgan, finalDisease)
raceSummary <- allSRAFinal %>% count(strictestRace, finalCountry, worldRegion, finalOrgan, finalDisease)
geoOrgan <- geographySummary %>% group_by(finalOrgan, strictestGeography) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestGeography)) %>% as.data.frame %>%
# geoOrgan <- meltGeography %>% drop_na(c(finalOrgan, strictestGeography)) %>%
ggplot(., aes(x = fct_rev(finalOrgan), y = value, fill = strictestGeography)) +
geom_bar(stat="identity") +
xlab("") +
ylab("Samples") +
coord_flip() +
scale_fill_geography(name="Population\nDescriptor") +
guides(fill=guide_legend(title="", label.position="left", ncol=1)) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,0), legend.position = c(1,0), legend.background=element_blank())
## `summarise()` has grouped output by 'finalOrgan'. You can override using the `.groups` argument.
raceOrgan <- raceSummary %>% group_by(finalOrgan, strictestRace) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestRace)) %>% as.data.frame %>%
# raceOrgan <- meltRace %>% drop_na(c(finalOrgan, strictestRace)) %>%
ggplot(., aes(x = fct_rev(finalOrgan), y = value, fill = strictestRace)) +
geom_bar(stat="identity") +
xlab("Sampled tissue") +
ylab("Samples") +
coord_flip() +
scale_fill_race(name="US Census\nTerm)") +
guides(fill=guide_legend(title="", label.position="left")) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,0), legend.position = c(1,0), legend.background=element_blank())
## `summarise()` has grouped output by 'finalOrgan'. You can override using the `.groups` argument.
geoOrgan + raceOrgan +
plot_layout(design=long2Design) +
plot_annotation(tag_levels = 'A')
ggsave("fig3_organ_by_descriptor.pdf", height=8, width=6)
ggsave("fig3_organ_by_descriptor.png", height=8, width=6)
# And now filtering to only those tissues with more than 50 observations
geoOrganSlim <- geographySummary %>% group_by(finalOrgan, strictestGeography) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestGeography)) %>% group_by(finalOrgan) %>% filter(sum(value) > 99) %>% as.data.frame %>%
# geoOrgan <- meltGeography %>% drop_na(c(finalOrgan, strictestGeography)) %>%
ggplot(., aes(x = finalOrgan, y = value, fill = strictestGeography)) +
geom_bar(stat="identity") +
xlab(NULL) +
ylab("Samples") +
# coord_flip() +
scale_fill_geography(name="Population\nDescriptor") +
guides(fill=guide_legend(title="", label.position="left", ncol=3)) +
theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,1), legend.position = c(1,1), legend.background=element_blank())
## `summarise()` has grouped output by 'finalOrgan'. You can override using the `.groups` argument.
raceOrganSlim <- raceSummary %>% group_by(finalOrgan, strictestRace) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestRace)) %>% group_by(finalOrgan) %>% filter(sum(value) > 99) %>% as.data.frame %>%
# raceOrgan <- meltRace %>% drop_na(c(finalOrgan, strictestRace)) %>%
ggplot(., aes(x = finalOrgan, y = value, fill = strictestRace)) +
geom_bar(stat="identity") +
xlab(NULL) +
ylab("Samples") +
# coord_flip() +
scale_fill_race(name="US Census\nTerm)") +
guides(fill=guide_legend(title="", label.position="left", ncol=2)) +
theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,1), legend.position = c(1,1), legend.background=element_blank())
## `summarise()` has grouped output by 'finalOrgan'. You can override using the `.groups` argument.
geoOrganSlim + raceOrganSlim +
plot_layout(design=wide2Design) +
plot_annotation(tag_levels = 'A')
ggsave("fig3_organ_by_descriptor_slim.pdf", width = 7, height = 4.5)
ggsave("fig3_organ_by_descriptor_slim.png", width = 7, height = 4.5)
# Some statistics: The percentage of samples that each tissue accounts for, and how many descriptors are associated:
allSRAFinal %>% count(hasDescriptor, finalOrgan) %>% group_by(hasDescriptor) %>% summarise(finalOrgan = finalOrgan, n = n, proportion = n/sum(n)) %>% as.data.frame
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'hasDescriptor'. You can override using the `.groups` argument.
## hasDescriptor finalOrgan n proportion
## 1 Geographic Adipose 54 5.496183e-03
## 2 Geographic Bladder 10 1.017812e-03
## 3 Geographic Blastoderm 22 2.239186e-03
## 4 Geographic Blood 5076 5.166412e-01
## 5 Geographic Blood vessel 131 1.333333e-02
## 6 Geographic Bone 4 4.071247e-04
## 7 Geographic Bone marrow 102 1.038168e-02
## 8 Geographic Brain 612 6.229008e-02
## 9 Geographic Breast 35 3.562341e-03
## 10 Geographic CNS 15 1.526718e-03
## 11 Geographic Cancer sample\n(NOS) 17 1.730280e-03
## 12 Geographic Heart 195 1.984733e-02
## 13 Geographic IPSC 901 9.170483e-02
## 14 Geographic Intestine 866 8.814249e-02
## 15 Geographic Joint 92 9.363868e-03
## 16 Geographic Kidney 24 2.442748e-03
## 17 Geographic Liver 99 1.007634e-02
## 18 Geographic Lung 224 2.279898e-02
## 19 Geographic Morula 41 4.173028e-03
## 20 Geographic Muscle 806 8.203562e-02
## 21 Geographic Nose 43 4.376590e-03
## 22 Geographic Pituitary gland 7 7.124682e-04
## 23 Geographic Placenta 32 3.256997e-03
## 24 Geographic Prostate 126 1.282443e-02
## 25 Geographic Skin 174 1.770992e-02
## 26 Geographic Testis 15 1.526718e-03
## 27 Geographic Thyroid 34 3.460560e-03
## 28 Geographic Trachea 12 1.221374e-03
## 29 Geographic <NA> 56 5.699746e-03
## 30 Racial Adipose 53 3.770900e-03
## 31 Racial Adrenal gland 3 2.134472e-04
## 32 Racial Bladder 2 1.422981e-04
## 33 Racial Blood 8955 6.371398e-01
## 34 Racial Blood vessel 58 4.126645e-03
## 35 Racial Bone marrow 52 3.699751e-03
## 36 Racial Brain 1084 7.712558e-02
## 37 Racial Breast 466 3.315546e-02
## 38 Racial Cancer sample\n(NOS) 720 5.122732e-02
## 39 Racial Cartilage 3 2.134472e-04
## 40 Racial Digestive tract 2 1.422981e-04
## 41 Racial Eye 47 3.344006e-03
## 42 Racial Heart 579 4.119530e-02
## 43 Racial IPSC 720 5.122732e-02
## 44 Racial Intestine 170 1.209534e-02
## 45 Racial Kidney 3 2.134472e-04
## 46 Racial Larynx 1 7.114906e-05
## 47 Racial Liver 208 1.479900e-02
## 48 Racial Lung 110 7.826396e-03
## 49 Racial Lymph node 22 1.565279e-03
## 50 Racial Muscle 16 1.138385e-03
## 51 Racial Nose 71 5.051583e-03
## 52 Racial Oral cavity 83 5.905372e-03
## 53 Racial Ovary 222 1.579509e-02
## 54 Racial PNS 12 8.537887e-04
## 55 Racial Pancreas 2 1.422981e-04
## 56 Racial Prostate 81 5.763074e-03
## 57 Racial Skin 101 7.186055e-03
## 58 Racial Spleen 3 2.134472e-04
## 59 Racial Stomach 45 3.201708e-03
## 60 Racial Testis 1 7.114906e-05
## 61 Racial Thymus 1 7.114906e-05
## 62 Racial Thyroid 1 7.114906e-05
## 63 Racial Tonsil 6 4.268943e-04
## 64 Racial Urinary tract 52 3.699751e-03
## 65 Racial Uterus 50 3.557453e-03
## 66 Racial Vagina 16 1.138385e-03
## 67 Racial <NA> 34 2.419068e-03
allSRAFinal %>% count(finalOrgan, strictestGeography) %>% drop_na(strictestGeography) %>% group_by(finalOrgan) %>% summarise(geoGroups = length(finalOrgan)) %>% arrange(desc(geoGroups)) %>% as.data.frame
## finalOrgan geoGroups
## 1 Blood 10
## 2 Brain 7
## 3 Intestine 6
## 4 Joint 6
## 5 Heart 5
## 6 IPSC 5
## 7 Blood vessel 4
## 8 Lung 4
## 9 Skin 4
## 10 Bone marrow 3
## 11 Cancer sample\n(NOS) 3
## 12 Kidney 3
## 13 Liver 3
## 14 Muscle 3
## 15 Testis 3
## 16 CNS 2
## 17 Pituitary gland 2
## 18 Prostate 2
## 19 Thyroid 2
## 20 Adipose 1
## 21 Bladder 1
## 22 Blastoderm 1
## 23 Bone 1
## 24 Breast 1
## 25 Morula 1
## 26 Nose 1
## 27 Placenta 1
## 28 Trachea 1
## 29 <NA> 1
allSRAFinal %>% count(finalOrgan, strictestRace) %>% drop_na(strictestRace) %>% group_by(finalOrgan) %>% summarise(raceGroups = length(finalOrgan)) %>% arrange(desc(raceGroups)) %>% as.data.frame
## finalOrgan raceGroups
## 1 Blood 8
## 2 Cancer sample\n(NOS) 7
## 3 Nose 6
## 4 Blood vessel 5
## 5 Breast 5
## 6 Heart 5
## 7 Intestine 5
## 8 Ovary 5
## 9 Bone marrow 4
## 10 IPSC 4
## 11 Lung 4
## 12 Oral cavity 4
## 13 Skin 4
## 14 Adipose 3
## 15 Brain 3
## 16 Muscle 3
## 17 Stomach 3
## 18 Urinary tract 3
## 19 Uterus 3
## 20 <NA> 3
## 21 Bladder 2
## 22 Cartilage 2
## 23 Liver 2
## 24 Lymph node 2
## 25 PNS 2
## 26 Prostate 2
## 27 Spleen 2
## 28 Tonsil 2
## 29 Vagina 2
## 30 Adrenal gland 1
## 31 Digestive tract 1
## 32 Eye 1
## 33 Kidney 1
## 34 Larynx 1
## 35 Pancreas 1
## 36 Testis 1
## 37 Thymus 1
## 38 Thyroid 1
# And how many tissues is each descriptor associated with?
allSRAFinal %>% drop_na(strictestGeography, finalOrgan) %>% count(finalOrgan)
## finalOrgan n
## 1 Adipose 54
## 2 Bladder 10
## 3 Blastoderm 22
## 4 Blood 5076
## 5 Blood vessel 131
## 6 Bone 4
## 7 Bone marrow 102
## 8 Brain 612
## 9 Breast 35
## 10 CNS 15
## 11 Cancer sample\n(NOS) 17
## 12 Heart 195
## 13 IPSC 901
## 14 Intestine 866
## 15 Joint 92
## 16 Kidney 24
## 17 Liver 99
## 18 Lung 224
## 19 Morula 41
## 20 Muscle 806
## 21 Nose 43
## 22 Pituitary gland 7
## 23 Placenta 32
## 24 Prostate 126
## 25 Skin 174
## 26 Testis 15
## 27 Thyroid 34
## 28 Trachea 12
allSRAFinal %>% drop_na(strictestRace, finalOrgan) %>% count(finalOrgan)
## finalOrgan n
## 1 Adipose 53
## 2 Adrenal gland 3
## 3 Bladder 2
## 4 Blood 8955
## 5 Blood vessel 58
## 6 Bone marrow 52
## 7 Brain 1084
## 8 Breast 466
## 9 Cancer sample\n(NOS) 720
## 10 Cartilage 3
## 11 Digestive tract 2
## 12 Eye 47
## 13 Heart 579
## 14 IPSC 720
## 15 Intestine 170
## 16 Kidney 3
## 17 Larynx 1
## 18 Liver 208
## 19 Lung 110
## 20 Lymph node 22
## 21 Muscle 16
## 22 Nose 71
## 23 Oral cavity 83
## 24 Ovary 222
## 25 PNS 12
## 26 Pancreas 2
## 27 Prostate 81
## 28 Skin 101
## 29 Spleen 3
## 30 Stomach 45
## 31 Testis 1
## 32 Thymus 1
## 33 Thyroid 1
## 34 Tonsil 6
## 35 Urinary tract 52
## 36 Uterus 50
## 37 Vagina 16
23/28
## [1] 0.8214286
36/37
## [1] 0.972973
allSRAFinal %>% count(finalOrgan, strictestGeography) %>% drop_na(strictestGeography) %>% group_by(strictestGeography) %>% summarise(geoGroups = length(strictestGeography)) %>% arrange(desc(geoGroups)) %>% as.data.frame
## strictestGeography geoGroups
## 1 Europe 23
## 2 Sub-Saharan Africa 16
## 3 East Asia 13
## 4 Asia (NOS) 12
## 5 South Asia 6
## 6 Americas 6
## 7 North Africa and\nWestern Asia 4
## 8 Other 3
## 9 Multiple 2
## 10 Southeast Asia 1
## 11 Oceania 1
allSRAFinal %>% count(finalOrgan, strictestRace) %>% drop_na(strictestRace) %>% group_by(strictestRace) %>% summarise(raceGroups = length(strictestRace)) %>% arrange(desc(raceGroups)) %>% as.data.frame
## strictestRace raceGroups
## 1 White 36
## 2 Black or\nAfrican American 25
## 3 Hispanic 16
## 4 Asian 14
## 5 Multiple 12
## 6 American Indian and\nAlaska Native 6
## 7 Other 4
## 8 Native Hawaiian and\nother Pacific Islander 1
# And now some alluvial plots, for funsies and supplementary data:
allSRAFinal %>% drop_na(strictestGeography) %>% count(worldRegion)
## worldRegion n
## 1 East Asia &\nPacific 1428
## 2 Europe &\nCentral Asia 3437
## 3 Latin America &\nCaribbean 84
## 4 North America 4810
## 5 South Asia 26
## 6 <NA> 40
allSRAFinal %>% drop_na(strictestRace) %>% count(worldRegion)
## worldRegion n
## 1 East Asia &\nPacific 386
## 2 Europe &\nCentral Asia 2520
## 3 Middle East &\nNorth Africa 11
## 4 North America 11138
geoTissueFacet <- geographySummary %>% group_by(finalOrgan, strictestGeography, worldRegion) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestGeography, worldRegion)) %>% as.data.frame %>%
ggplot(., aes(x = finalOrgan, y = value, fill = strictestGeography)) +
geom_bar(stat="identity") +
ggtitle("Samples with geographic/ancestry labels deposited in:") +
xlab("Sampled tissue") +
ylab("Samples") +
scale_fill_geography(name="Population\nDescriptor") +
guides(fill=guide_legend(title="", nrow=2)) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal") +
theme(strip.background = element_blank()) +
facet_wrap(~worldRegion, ncol=1, scales="free_y")
## `summarise()` has grouped output by 'finalOrgan', 'strictestGeography'. You can override using the `.groups` argument.
geoTissueFacet
ggsave("fig3_organ_by_geography_faceted.pdf", width=6, height=9)
ggsave("fig3_organ_by_geography_faceted.png", width=6, height=9)
raceTissueFacet <- raceSummary %>% group_by(finalOrgan, strictestRace, worldRegion) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestRace, worldRegion)) %>% as.data.frame %>%
ggplot(., aes(x = finalOrgan, y = value, fill = strictestRace)) +
geom_bar(stat="identity") +
ggtitle("Samples with US Census labels deposited in:") +
xlab("Sampled tissue") +
ylab("Samples") +
scale_fill_race(name="US Census\nTerm)") +
guides(fill=guide_legend(title="")) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal") +
theme(strip.background = element_blank()) +
facet_wrap(~worldRegion, ncol=1, scales="free_y")
## `summarise()` has grouped output by 'finalOrgan', 'strictestRace'. You can override using the `.groups` argument.
raceTissueFacet
ggsave("fig3_organ_by_race_faceted.pdf", width=6, height=9)
ggsave("fig3_organ_by_race_faceted.png", width=6, height=9)
And now we should ask, since these proportions look different, are there differences? Is having a given descriptor associated with higher likelihood or something or something else being sequenced?
We can do this at the hasDescriptor level (racial vs geographic), at the finalCountry level and at the worldRegion level, although we also might want to consider the actual descriptors
But first, an easier question: Is blood really more diverse than anything else?
bloodPropGeo <- allSRAFinal %>% filter(grepl("Blood", finalOrgan) & !is.na(strictestGeography)) %>%
count(strictestGeography) %>%
mutate(freq = n/sum(n))
nobloodPropGeo <- allSRAFinal %>% filter(!grepl("Blood", finalOrgan) & !is.na(strictestGeography)) %>%
count(strictestGeography) %>%
mutate(freq = n/sum(n))
bloodPropRace <- allSRAFinal %>% filter(grepl("Blood", finalOrgan) & !is.na(strictestRace)) %>%
count(strictestRace) %>%
mutate(freq = n/sum(n))
nobloodPropRace <- allSRAFinal %>% filter(!grepl("Blood", finalOrgan) & !is.na(strictestRace)) %>%
count(strictestRace) %>%
mutate(freq = n/sum(n))
geoBlood <- full_join(bloodPropGeo, nobloodPropGeo, by="strictestGeography", suffix=c("Blood", "NoBlood")) %>%
full_join(., geographyProp, by="strictestGeography") %>%
mutate(across(where(is.numeric), ~replace(., is.na(.), 0)))
raceBlood <- full_join(bloodPropRace, nobloodPropRace, by="strictestRace", suffix=c("Blood", "NoBlood")) %>%
full_join(., raceProp, by="strictestRace") %>%
mutate(across(where(is.numeric), ~replace(., is.na(.), 0)))
chisq.test(geoBlood[,c(2,6)])
## Warning in chisq.test(geoBlood[, c(2, 6)]): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: geoBlood[, c(2, 6)]
## X-squared = 509.04, df = 10, p-value < 2.2e-16
chisq.test(raceBlood[,c(2,6)])
## Warning in chisq.test(raceBlood[, c(2, 6)]): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: raceBlood[, c(2, 6)]
## X-squared = 114.6, df = 7, p-value < 2.2e-16
# Both are significant but blood is clearly a lot more diverse
geoBlood
## strictestGeography nBlood freqBlood nNoBlood freqNoBlood n freq
## 1 Sub-Saharan Africa 1402 0.269252929 292 0.063230836 1694 0.172417303
## 2 North Africa and\nWestern Asia 12 0.002304590 6 0.001299264 18 0.001832061
## 3 Europe 2194 0.421355867 3275 0.709181464 5469 0.556641221
## 4 South Asia 693 0.133090071 25 0.005413599 718 0.073078880
## 5 Southeast Asia 49 0.009410409 0 0.000000000 49 0.004987277
## 6 East Asia 474 0.091031304 647 0.140103941 1121 0.114096692
## 7 Asia (NOS) 109 0.020933359 201 0.043525336 310 0.031552163
## 8 Americas 39 0.007489917 127 0.027501083 166 0.016895674
## 9 Multiple 180 0.034568850 27 0.005846687 207 0.021068702
## 10 Other 55 0.010562704 7 0.001515808 62 0.006310433
## 11 Oceania 0 0.000000000 11 0.002381984 11 0.001119593
raceBlood
## strictestRace nBlood freqBlood nNoBlood freqNoBlood n freq
## 1 American Indian and\nAlaska Native 34 0.0037723289 9 0.001785006 43 0.0030594095
## 2 Asian 536 0.0594696549 408 0.080920270 944 0.0671647101
## 3 Black or\nAfrican American 849 0.0941972706 893 0.177112257 1742 0.1239416578
## 4 Hispanic 1056 0.1171640963 199 0.039468465 1255 0.0892920669
## 5 Multiple 178 0.0197492511 25 0.004958350 203 0.0144432586
## 6 Native Hawaiian and\nother Pacific Islander 6 0.0006657051 0 0.000000000 6 0.0004268943
## 7 Other 155 0.0171973816 7 0.001388338 162 0.0115261473
## 8 White 6199 0.6877843115 3501 0.694367315 9700 0.6901458556
allPropGeo <- allSRAFinal %>% filter(!is.na(worldRegion)) %>%
group_by(worldRegion) %>%
count(finalOrgan) %>%
as.data.frame()
# Not very promising...
anova(lm(n ~ finalOrgan + worldRegion, data=allPropGeo))
## Analysis of Variance Table
##
## Response: n
## Df Sum Sq Mean Sq F value Pr(>F)
## finalOrgan 44 45284988 1029204 0.9390 0.5823
## worldRegion 5 15606168 3121234 2.8476 0.0303 *
## Residuals 33 36170641 1096080
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
allPropGeo <- allPropGeo %>%
cast(., worldRegion ~ finalOrgan) %>%
mutate(across(where(is.numeric), ~replace(., is.na(.), 0)))
## Using n as value column. Use the value argument to cast to override this choice
# Boring, uninterpretable, cannot be bothered to go any further. Keep it descriptive
chisq.test(allPropGeo[,2:ncol(allPropGeo)])
## Warning in chisq.test(allPropGeo[, 2:ncol(allPropGeo)]): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: allPropGeo[, 2:ncol(allPropGeo)]
## X-squared = 19965, df = 225, p-value < 2.2e-16
geoDisease <- geographySummary %>% group_by(finalDisease, strictestGeography) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestGeography)) %>% as.data.frame %>%
# geoDisease <- meltGeography %>% drop_na(c(finalDisease, strictestGeography)) %>%
ggplot(., aes(x = fct_rev(finalDisease), y = value, fill = strictestGeography)) +
geom_bar(stat="identity") +
xlab("") +
ylab("Samples") +
coord_flip() +
scale_fill_geography(name="Population\nDescriptor") +
guides(fill=guide_legend(title="", label.position="left", ncol=1)) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,0), legend.position = c(1,0), legend.background=element_blank())
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
raceDisease <- raceSummary %>% group_by(finalDisease, strictestRace) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestRace)) %>% as.data.frame %>%
# raceDisease <- meltRace %>% drop_na(c(finalDisease, strictestRace)) %>%
ggplot(., aes(x = fct_rev(finalDisease), y = value, fill = strictestRace)) +
geom_bar(stat="identity") +
xlab("Sampled tissue") +
ylab("Samples") +
coord_flip() +
scale_fill_race(name="US Census\nTerm)") +
guides(fill=guide_legend(title="", label.position="left")) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,0), legend.position = c(1,0), legend.background=element_blank())
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
geoDisease + raceDisease +
plot_layout(design=long2Design) +
plot_annotation(tag_levels = 'A')
ggsave("fig4_disease_by_descriptor.pdf", height=8, width=6)
ggsave("fig4_disease_by_descriptor.png", height=8, width=6)
# And now filtering to only those tissues with more than 50 observations
geoDiseaseSlim <- geographySummary %>% group_by(finalDisease, strictestGeography) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestGeography)) %>% group_by(finalDisease) %>% filter(sum(value) > 9) %>% as.data.frame %>%
# geoDisease <- meltGeography %>% drop_na(c(finalDisease, strictestGeography)) %>%
ggplot(., aes(x = finalDisease, y = value, fill = strictestGeography)) +
geom_bar(stat="identity") +
xlab(NULL) +
ylab("Samples") +
# coord_flip() +
scale_fill_geography(name="Population\nDescriptor") +
guides(fill=guide_legend(title="", label.position="left", ncol=3)) +
theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,1), legend.position = c(1,1), legend.background=element_blank())
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
raceDiseaseSlim <- raceSummary %>% group_by(finalDisease, strictestRace) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestRace)) %>% group_by(finalDisease) %>% filter(sum(value) > 9) %>% as.data.frame %>%
# raceDisease <- meltRace %>% drop_na(c(finalDisease, strictestRace)) %>%
ggplot(., aes(x = finalDisease, y = value, fill = strictestRace)) +
geom_bar(stat="identity") +
xlab(NULL) +
ylab("Samples") +
# coord_flip() +
scale_fill_race(name="US Census\nTerm)") +
guides(fill=guide_legend(title="", label.position="left", ncol=2)) +
theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,1), legend.position = c(1,1), legend.background=element_blank())
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
geoDiseaseSlim + raceDiseaseSlim +
plot_layout(design=wide2Design) +
plot_annotation(tag_levels = 'A')
ggsave("fig4_disease_by_descriptor_slim.pdf", width = 7, height = 4.5)
ggsave("fig4_disease_by_descriptor_slim.png", width = 7, height = 4.5)
# Some statistics: The percentage of samples that each tissue accounts for, and how many descriptors are associated:
allSRAFinal %>% count(hasDescriptor, finalDisease) %>% group_by(hasDescriptor) %>% summarise(finalDisease = finalDisease, n = n, proportion = n/sum(n)) %>% as.data.frame
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'hasDescriptor'. You can override using the `.groups` argument.
## hasDescriptor finalDisease n proportion
## 1 Geographic Acute trauma 12 0.0012213740
## 2 Geographic Autoimmune 674 0.0686005089
## 3 Geographic Blood 24 0.0024427481
## 4 Geographic Cancer 551 0.0560814249
## 5 Geographic Cardiovascular 44 0.0044783715
## 6 Geographic Endocrine 10 0.0010178117
## 7 Geographic Gastrointestinal 7 0.0007124682
## 8 Geographic Genetic syndrome 57 0.0058015267
## 9 Geographic Healthy control 1134 0.1154198473
## 10 Geographic Infectious 276 0.0280916031
## 11 Geographic Integumentary 7 0.0007124682
## 12 Geographic Kidney 17 0.0017302799
## 13 Geographic Mental health 28 0.0028498728
## 14 Geographic Metabolic 7 0.0007124682
## 15 Geographic Neurodegenerative 72 0.0073282443
## 16 Geographic Neurological 6 0.0006106870
## 17 Geographic Other 7 0.0007124682
## 18 Geographic Reproductive 10 0.0010178117
## 19 Geographic Respiratory 107 0.0108905852
## 20 Geographic <NA> 6775 0.6895674300
## 21 Racial Autoimmune 309 0.0219850587
## 22 Racial Blood 5 0.0003557453
## 23 Racial Cancer 981 0.0697972252
## 24 Racial Cardiovascular 13 0.0009249377
## 25 Racial Healthy control 930 0.0661686233
## 26 Racial Infectious 61 0.0043400925
## 27 Racial Mental health 437 0.0310921380
## 28 Racial Metabolic 227 0.0161508360
## 29 Racial Neurodegenerative 111 0.0078975454
## 30 Racial Neurological 8 0.0005691925
## 31 Racial Other 2 0.0001422981
## 32 Racial Reproductive 7 0.0004980434
## 33 Racial Respiratory 20 0.0014229811
## 34 Racial <NA> 10944 0.7786552828
allSRAFinal %>% count(worldRegion, finalDisease) %>% group_by(worldRegion) %>% summarise(finalDisease = finalDisease, n = n, proportion = n/sum(n)) %>% arrange(finalDisease) %>% as.data.frame
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
## worldRegion finalDisease n proportion
## 1 East Asia &\nPacific Acute trauma 12 6.615215e-03
## 2 East Asia &\nPacific Autoimmune 98 5.402426e-02
## 3 Europe &\nCentral Asia Autoimmune 32 5.371831e-03
## 4 Latin America &\nCaribbean Autoimmune 1 1.190476e-02
## 5 North America Autoimmune 852 5.342363e-02
## 6 East Asia &\nPacific Blood 27 1.488423e-02
## 7 Latin America &\nCaribbean Blood 1 1.190476e-02
## 8 North America Blood 1 6.270379e-05
## 9 East Asia &\nPacific Cancer 380 2.094818e-01
## 10 Europe &\nCentral Asia Cancer 208 3.491690e-02
## 11 Latin America &\nCaribbean Cancer 23 2.738095e-01
## 12 North America Cancer 907 5.687234e-02
## 13 <NA> Cancer 14 3.500000e-01
## 14 East Asia &\nPacific Cardiovascular 27 1.488423e-02
## 15 Europe &\nCentral Asia Cardiovascular 15 2.518046e-03
## 16 North America Cardiovascular 13 8.151492e-04
## 17 South Asia Cardiovascular 2 7.692308e-02
## 18 East Asia &\nPacific Endocrine 10 5.512679e-03
## 19 East Asia &\nPacific Gastrointestinal 5 2.756340e-03
## 20 North America Gastrointestinal 2 1.254076e-04
## 21 Europe &\nCentral Asia Genetic syndrome 45 7.554138e-03
## 22 North America Genetic syndrome 12 7.524454e-04
## 23 East Asia &\nPacific Healthy control 122 6.725469e-02
## 24 Europe &\nCentral Asia Healthy control 834 1.400034e-01
## 25 Latin America &\nCaribbean Healthy control 27 3.214286e-01
## 26 North America Healthy control 1061 6.652872e-02
## 27 South Asia Healthy control 2 7.692308e-02
## 28 <NA> Healthy control 18 4.500000e-01
## 29 Europe &\nCentral Asia Infectious 208 3.491690e-02
## 30 Latin America &\nCaribbean Infectious 2 2.380952e-02
## 31 North America Infectious 127 7.963381e-03
## 32 East Asia &\nPacific Integumentary 7 3.858875e-03
## 33 East Asia &\nPacific Kidney 17 9.371555e-03
## 34 East Asia &\nPacific Mental health 8 4.410143e-03
## 35 North America Mental health 457 2.865563e-02
## 36 Europe &\nCentral Asia Metabolic 16 2.685916e-03
## 37 North America Metabolic 218 1.366943e-02
## 38 East Asia &\nPacific Neurodegenerative 20 1.102536e-02
## 39 Europe &\nCentral Asia Neurodegenerative 19 3.189525e-03
## 40 Latin America &\nCaribbean Neurodegenerative 15 1.785714e-01
## 41 North America Neurodegenerative 129 8.088789e-03
## 42 East Asia &\nPacific Neurological 6 3.307607e-03
## 43 North America Neurological 8 5.016303e-04
## 44 East Asia &\nPacific Other 9 4.961411e-03
## 45 East Asia &\nPacific Reproductive 7 3.858875e-03
## 46 South Asia Reproductive 10 3.846154e-01
## 47 East Asia &\nPacific Respiratory 25 1.378170e-02
## 48 Europe &\nCentral Asia Respiratory 48 8.057747e-03
## 49 North America Respiratory 54 3.386005e-03
## 50 East Asia &\nPacific <NA> 1034 5.700110e-01
## 51 Europe &\nCentral Asia <NA> 4532 7.607856e-01
## 52 Latin America &\nCaribbean <NA> 15 1.785714e-01
## 53 Middle East &\nNorth Africa <NA> 11 1.000000e+00
## 54 North America <NA> 12107 7.591548e-01
## 55 South Asia <NA> 12 4.615385e-01
## 56 <NA> <NA> 8 2.000000e-01
allSRAFinal %>% count(worldRegion, finalDisease) %>% group_by(finalDisease) %>% summarise(worldRegion = worldRegion, n = n, proportion = n/sum(n)) %>% arrange(finalDisease) %>% as.data.frame
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
## finalDisease worldRegion n proportion
## 1 Acute trauma East Asia &\nPacific 12 1.0000000000
## 2 Autoimmune East Asia &\nPacific 98 0.0996948118
## 3 Autoimmune Europe &\nCentral Asia 32 0.0325534079
## 4 Autoimmune Latin America &\nCaribbean 1 0.0010172940
## 5 Autoimmune North America 852 0.8667344863
## 6 Blood East Asia &\nPacific 27 0.9310344828
## 7 Blood Latin America &\nCaribbean 1 0.0344827586
## 8 Blood North America 1 0.0344827586
## 9 Cancer East Asia &\nPacific 380 0.2480417755
## 10 Cancer Europe &\nCentral Asia 208 0.1357702350
## 11 Cancer Latin America &\nCaribbean 23 0.0150130548
## 12 Cancer North America 907 0.5920365535
## 13 Cancer <NA> 14 0.0091383812
## 14 Cardiovascular East Asia &\nPacific 27 0.4736842105
## 15 Cardiovascular Europe &\nCentral Asia 15 0.2631578947
## 16 Cardiovascular North America 13 0.2280701754
## 17 Cardiovascular South Asia 2 0.0350877193
## 18 Endocrine East Asia &\nPacific 10 1.0000000000
## 19 Gastrointestinal East Asia &\nPacific 5 0.7142857143
## 20 Gastrointestinal North America 2 0.2857142857
## 21 Genetic syndrome Europe &\nCentral Asia 45 0.7894736842
## 22 Genetic syndrome North America 12 0.2105263158
## 23 Healthy control East Asia &\nPacific 122 0.0591085271
## 24 Healthy control Europe &\nCentral Asia 834 0.4040697674
## 25 Healthy control Latin America &\nCaribbean 27 0.0130813953
## 26 Healthy control North America 1061 0.5140503876
## 27 Healthy control South Asia 2 0.0009689922
## 28 Healthy control <NA> 18 0.0087209302
## 29 Infectious Europe &\nCentral Asia 208 0.6172106825
## 30 Infectious Latin America &\nCaribbean 2 0.0059347181
## 31 Infectious North America 127 0.3768545994
## 32 Integumentary East Asia &\nPacific 7 1.0000000000
## 33 Kidney East Asia &\nPacific 17 1.0000000000
## 34 Mental health East Asia &\nPacific 8 0.0172043011
## 35 Mental health North America 457 0.9827956989
## 36 Metabolic Europe &\nCentral Asia 16 0.0683760684
## 37 Metabolic North America 218 0.9316239316
## 38 Neurodegenerative East Asia &\nPacific 20 0.1092896175
## 39 Neurodegenerative Europe &\nCentral Asia 19 0.1038251366
## 40 Neurodegenerative Latin America &\nCaribbean 15 0.0819672131
## 41 Neurodegenerative North America 129 0.7049180328
## 42 Neurological East Asia &\nPacific 6 0.4285714286
## 43 Neurological North America 8 0.5714285714
## 44 Other East Asia &\nPacific 9 1.0000000000
## 45 Reproductive East Asia &\nPacific 7 0.4117647059
## 46 Reproductive South Asia 10 0.5882352941
## 47 Respiratory East Asia &\nPacific 25 0.1968503937
## 48 Respiratory Europe &\nCentral Asia 48 0.3779527559
## 49 Respiratory North America 54 0.4251968504
## 50 <NA> East Asia &\nPacific 1034 0.0583554377
## 51 <NA> Europe &\nCentral Asia 4532 0.2557706417
## 52 <NA> Latin America &\nCaribbean 15 0.0008465489
## 53 <NA> Middle East &\nNorth Africa 11 0.0006208025
## 54 <NA> North America 12107 0.6832778373
## 55 <NA> South Asia 12 0.0006772391
## 56 <NA> <NA> 8 0.0004514927
allSRAFinal %>% count(finalDisease, strictestGeography) %>% drop_na(strictestGeography) %>% group_by(finalDisease) %>% summarise(geoGroups = length(finalDisease)) %>% arrange(desc(geoGroups)) %>% as.data.frame
## finalDisease geoGroups
## 1 <NA> 11
## 2 Healthy control 8
## 3 Autoimmune 7
## 4 Cancer 5
## 5 Infectious 5
## 6 Respiratory 4
## 7 Cardiovascular 3
## 8 Neurodegenerative 3
## 9 Acute trauma 2
## 10 Blood 2
## 11 Gastrointestinal 2
## 12 Mental health 2
## 13 Metabolic 2
## 14 Endocrine 1
## 15 Genetic syndrome 1
## 16 Integumentary 1
## 17 Kidney 1
## 18 Neurological 1
## 19 Other 1
## 20 Reproductive 1
allSRAFinal %>% count(finalDisease, strictestRace) %>% drop_na(strictestRace) %>% group_by(finalDisease) %>% summarise(raceGroups = length(finalDisease)) %>% arrange(desc(raceGroups)) %>% as.data.frame
## finalDisease raceGroups
## 1 <NA> 8
## 2 Cancer 7
## 3 Autoimmune 6
## 4 Healthy control 6
## 5 Metabolic 4
## 6 Infectious 3
## 7 Mental health 3
## 8 Neurological 3
## 9 Blood 2
## 10 Neurodegenerative 2
## 11 Cardiovascular 1
## 12 Other 1
## 13 Reproductive 1
## 14 Respiratory 1
# And how many tissues is each descriptor associated with?
allSRAFinal %>% drop_na(strictestGeography, finalDisease) %>% count(finalDisease)
## finalDisease n
## 1 Acute trauma 12
## 2 Autoimmune 674
## 3 Blood 24
## 4 Cancer 551
## 5 Cardiovascular 44
## 6 Endocrine 10
## 7 Gastrointestinal 7
## 8 Genetic syndrome 57
## 9 Healthy control 1134
## 10 Infectious 276
## 11 Integumentary 7
## 12 Kidney 17
## 13 Mental health 28
## 14 Metabolic 7
## 15 Neurodegenerative 72
## 16 Neurological 6
## 17 Other 7
## 18 Reproductive 10
## 19 Respiratory 107
allSRAFinal %>% drop_na(strictestRace, finalDisease) %>% count(finalDisease)
## finalDisease n
## 1 Autoimmune 309
## 2 Blood 5
## 3 Cancer 981
## 4 Cardiovascular 13
## 5 Healthy control 930
## 6 Infectious 61
## 7 Mental health 437
## 8 Metabolic 227
## 9 Neurodegenerative 111
## 10 Neurological 8
## 11 Other 2
## 12 Reproductive 7
## 13 Respiratory 20
allSRAFinal %>% count(finalDisease, strictestGeography) %>% drop_na(strictestGeography) %>% group_by(strictestGeography) %>% summarise(geoGroups = length(strictestGeography)) %>% arrange(desc(geoGroups)) %>% as.data.frame
## strictestGeography geoGroups
## 1 Europe 13
## 2 East Asia 11
## 3 Sub-Saharan Africa 8
## 4 Americas 7
## 5 South Asia 6
## 6 Asia (NOS) 5
## 7 Multiple 4
## 8 Other 4
## 9 North Africa and\nWestern Asia 3
## 10 Southeast Asia 1
## 11 Oceania 1
allSRAFinal %>% count(finalDisease, strictestRace) %>% drop_na(strictestRace) %>% group_by(strictestRace) %>% summarise(raceGroups = length(strictestRace)) %>% arrange(desc(raceGroups)) %>% as.data.frame
## strictestRace raceGroups
## 1 Hispanic 10
## 2 White 10
## 3 Black or\nAfrican American 9
## 4 Asian 8
## 5 Other 4
## 6 Multiple 3
## 7 American Indian and\nAlaska Native 2
## 8 Native Hawaiian and\nother Pacific Islander 2
# And now some alluvial plots, for funsies and supplementary data:
geoDiseaseFacet <- geographySummary %>% group_by(finalDisease, strictestGeography, worldRegion) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestGeography, worldRegion)) %>% as.data.frame %>%
ggplot(., aes(x = finalDisease, y = value, fill = strictestGeography)) +
geom_bar(stat="identity") +
ggtitle("Samples with geographic/ancestry labels deposited in:") +
xlab("Sampled tissue") +
ylab("Samples") +
scale_fill_geography(name="Population\nDescriptor") +
guides(fill=guide_legend(title="", nrow=2)) +
theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal") +
theme(strip.background = element_blank()) +
facet_wrap(~worldRegion, ncol=1, scales="free_y")
## `summarise()` has grouped output by 'finalDisease', 'strictestGeography'. You can override using the `.groups` argument.
geoDiseaseFacet
ggsave("fig4_disease_by_geography_faceted.pdf", width=6, height=9)
ggsave("fig4_disease_by_geography_faceted.png", width=6, height=9)
raceDiseaseFacet <- raceSummary %>% group_by(finalDisease, strictestRace, worldRegion) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestRace, worldRegion)) %>% as.data.frame %>%
ggplot(., aes(x = finalDisease, y = value, fill = strictestRace)) +
geom_bar(stat="identity") +
ggtitle("Samples with US Census labels deposited in:") +
xlab("Sampled tissue") +
ylab("Samples") +
scale_fill_race(name="US Census\nTerm)") +
guides(fill=guide_legend(title="")) +
theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal") +
theme(strip.background = element_blank()) +
facet_wrap(~worldRegion, ncol=1, scales="free_y")
## `summarise()` has grouped output by 'finalDisease', 'strictestRace'. You can override using the `.groups` argument.
raceDiseaseFacet
ggsave("fig4_disease_by_race_faceted.pdf", width=6, height=9)
ggsave("fig4_disease_by_race_faceted.png", width=6, height=9)
What we really want to get to is the relationship between tissue and disease, because we think it might be interesting.
# Let's collapse tissues with under 100 observations and diseases with under 10 into "Other"
geoTissuesToKeep <- allSRAFinal %>% drop_na(strictestGeography) %>% count(finalOrgan) %>% filter(n >= 100)
raceTissuesToKeep <- allSRAFinal %>% drop_na(strictestRace) %>% count(finalOrgan) %>% filter(n >= 100)
geoDiseasesToKeep <- allSRAFinal %>% drop_na(strictestGeography) %>% count(finalDisease) %>% filter(n >= 10)
raceDiseasesToKeep <- allSRAFinal %>% drop_na(strictestRace) %>% count(finalDisease) %>% filter(n >= 10)
geoDisAlluvial <- allSRAFinal %>% drop_na(c(strictestGeography, worldRegion, finalOrgan, finalDisease)) %>% count(worldRegion, strictestGeography, finalOrgan, finalDisease) %>%
filter(grepl(paste(geoTissuesToKeep$finalOrgan, collapse="|"), finalOrgan)) %>%
filter(grepl(paste(geoDiseasesToKeep$finalDisease, collapse="|"), finalDisease)) %>%
ggplot(data = .,
aes(axis1 = worldRegion, axis2 = finalOrgan, axis3 = finalDisease, y = n)) +
scale_x_discrete(limits = c("SRA Submitter\nWBER", "Disease class", "Tissue sequenced"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = strictestGeography)) +
scale_fill_geography() +
geom_stratum(width=1/4) +
ylab("Samples") +
# geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=2.5) +
geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
theme_minimal(base_size = 6) +
guides(fill=guide_legend(title="Descriptor", nrow=2)) +
theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal")
raceDisAlluvial <- allSRAFinal %>% drop_na(c(strictestRace, worldRegion, finalOrgan, finalDisease)) %>% count(worldRegion, strictestRace, finalOrgan, finalDisease) %>%
filter(grepl(paste(raceTissuesToKeep$finalOrgan, collapse="|"), finalOrgan)) %>%
filter(grepl(paste(raceDiseasesToKeep$finalDisease, collapse="|"), finalDisease)) %>%
ggplot(data = .,
aes(axis1 = worldRegion, axis2 = finalOrgan, axis3 = finalDisease, y = n)) +
scale_x_discrete(limits = c("SRA Submitter\nWBER", "Tissue sequenced", "Disease class"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = strictestRace)) +
scale_fill_race() +
geom_stratum(width=1/4) +
ylab("Samples") +
# geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=2.5) +
geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
theme_minimal(base_size = 6) +
guides(fill=guide_legend(title="Descriptor", nrow=2)) +
theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal")
geoDisAlluvial + raceDisAlluvial +
plot_layout(design=long2Design) +
plot_annotation(tag_levels = 'A')
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
ggsave("fig4_organ_disease_alluvial.pdf", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
geoDisAlluvial <- allSRAFinal %>% drop_na(c(strictestGeography, worldRegion, finalOrgan, finalDisease)) %>% count(worldRegion, strictestGeography, finalOrgan, finalDisease) %>%
ggplot(data = .,
aes(axis1 = worldRegion, axis2 = finalOrgan, axis3 = finalDisease, y = n)) +
scale_x_discrete(limits = c("SRA Submitter\nWBER", "Disease class", "Tissue sequenced"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = strictestGeography)) +
scale_fill_geography() +
geom_stratum(width=1/4) +
ylab("Samples") +
# geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=2.5) +
geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
theme_minimal(base_size = 6) +
guides(fill=guide_legend(title="Descriptor", nrow=2)) +
theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal")
geoDisAlluvial
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
ggsave("fig4_organ_disease_geo_alluvial_full.pdf", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
ggsave("fig4_organ_disease_geo_alluvial_full.png", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
raceDisAlluvial <- allSRAFinal %>% drop_na(c(strictestRace, worldRegion, finalOrgan, finalDisease)) %>% count(worldRegion, strictestRace, finalOrgan, finalDisease) %>%
ggplot(data = .,
aes(axis1 = worldRegion, axis2 = finalOrgan, axis3 = finalDisease, y = n)) +
scale_x_discrete(limits = c("SRA Submitter\nWBER", "Tissue sequenced", "Disease class"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = strictestRace)) +
scale_fill_race() +
geom_stratum(width=1/4) +
ylab("Samples") +
# geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=2.5) +
geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
theme_minimal(base_size = 6) +
guides(fill=guide_legend(title="Descriptor", nrow=2)) +
theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal")
raceDisAlluvial
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggsave("fig4_organ_disease_race_alluvial_full.pdf", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
ggsave("fig4_organ_disease_race_alluvial_full.png", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
geoDisAlluvial + raceDisAlluvial +
plot_layout(design=long2Design) +
plot_annotation(tag_levels = 'A')
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggsave("fig4_organ_disease_alluvial_full.pdf", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
And finally… the numbers that underlie these plots, because otherwise I’ll go crazy trying to tabulate things:
allSRAFinal <- allSRAFinal %>%
mutate(hasDisease = if_else(is.na(finalDisease), "No", "Yes"))
# Who has disease info?
allSRAFinal %>% count(hasDisease) %>% mutate(freq = n/sum(n))
## hasDisease n freq
## 1 No 17719 0.7420017
## 2 Yes 6161 0.2579983
allSRAFinal %>% drop_na(strictestGeography) %>% count(hasDisease) %>% mutate(freq = n/sum(n))
## hasDisease n freq
## 1 No 6775 0.6895674
## 2 Yes 3050 0.3104326
allSRAFinal %>% drop_na(strictestRace) %>% count(hasDisease) %>% mutate(freq = n/sum(n))
## hasDisease n freq
## 1 No 10944 0.7786553
## 2 Yes 3111 0.2213447
# Some of these could work better as plots:
ggplot(allSRAFinal, aes(x = finalOrgan, fill = hasDisease)) +
geom_bar(position = "fill") +
ggtitle("") +
xlab("Sequenced tissue") +
ylab("Proportion of samples") +
coord_fixed(ratio=6) +
guides(fill=guide_legend(title="Disease info?")) +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5))
allSRAFinal %>% drop_na(finalOrgan) %>% count(hasDisease) %>% mutate(freq = n/sum(n))
## hasDisease n freq
## 1 No 17707 0.7443043
## 2 Yes 6083 0.2556957
allSRAFinal %>% drop_na(finalOrgan) %>% count(finalOrgan, hasDisease) %>% group_by(finalOrgan) %>% mutate(freq = n/sum(n)) %>% ungroup() %>% as.data.frame()
## finalOrgan hasDisease n freq
## 1 Adipose No 104 0.971962617
## 2 Adipose Yes 3 0.028037383
## 3 Adrenal gland Yes 3 1.000000000
## 4 Bladder No 1 0.083333333
## 5 Bladder Yes 11 0.916666667
## 6 Blastoderm No 22 1.000000000
## 7 Blood No 12489 0.890100492
## 8 Blood Yes 1542 0.109899508
## 9 Blood vessel No 102 0.539682540
## 10 Blood vessel Yes 87 0.460317460
## 11 Bone No 4 1.000000000
## 12 Bone marrow No 62 0.402597403
## 13 Bone marrow Yes 92 0.597402597
## 14 Brain No 969 0.571344340
## 15 Brain Yes 727 0.428655660
## 16 Breast No 353 0.704590818
## 17 Breast Yes 148 0.295409182
## 18 CNS No 8 0.533333333
## 19 CNS Yes 7 0.466666667
## 20 Cancer sample\n(NOS) Yes 737 1.000000000
## 21 Cartilage No 3 1.000000000
## 22 Digestive tract Yes 2 1.000000000
## 23 Eye No 47 1.000000000
## 24 Heart No 752 0.971576227
## 25 Heart Yes 22 0.028423773
## 26 IPSC No 541 0.333744602
## 27 IPSC Yes 1080 0.666255398
## 28 Intestine No 263 0.253861004
## 29 Intestine Yes 773 0.746138996
## 30 Joint Yes 92 1.000000000
## 31 Kidney No 25 0.925925926
## 32 Kidney Yes 2 0.074074074
## 33 Larynx Yes 1 1.000000000
## 34 Liver No 63 0.205211726
## 35 Liver Yes 244 0.794788274
## 36 Lung No 282 0.844311377
## 37 Lung Yes 52 0.155688623
## 38 Lymph node No 20 0.909090909
## 39 Lymph node Yes 2 0.090909091
## 40 Morula No 41 1.000000000
## 41 Muscle No 803 0.976885645
## 42 Muscle Yes 19 0.023114355
## 43 Nose No 79 0.692982456
## 44 Nose Yes 35 0.307017544
## 45 Oral cavity No 66 0.795180723
## 46 Oral cavity Yes 17 0.204819277
## 47 Ovary No 221 0.995495495
## 48 Ovary Yes 1 0.004504505
## 49 PNS No 12 1.000000000
## 50 Pancreas Yes 2 1.000000000
## 51 Pituitary gland No 7 1.000000000
## 52 Placenta No 32 1.000000000
## 53 Prostate No 65 0.314009662
## 54 Prostate Yes 142 0.685990338
## 55 Skin No 155 0.563636364
## 56 Skin Yes 120 0.436363636
## 57 Spleen Yes 3 1.000000000
## 58 Stomach No 1 0.022222222
## 59 Stomach Yes 44 0.977777778
## 60 Testis No 1 0.062500000
## 61 Testis Yes 15 0.937500000
## 62 Thymus Yes 1 1.000000000
## 63 Thyroid No 6 0.171428571
## 64 Thyroid Yes 29 0.828571429
## 65 Tonsil Yes 6 1.000000000
## 66 Trachea Yes 12 1.000000000
## 67 Urinary tract No 52 1.000000000
## 68 Uterus No 40 0.800000000
## 69 Uterus Yes 10 0.200000000
## 70 Vagina No 16 1.000000000
# Increasing levels of disease and geography granularity:
allSRAFinal %>% drop_na(strictestGeography) %>% count(strictestGeography, hasDisease) %>% mutate(freq = n/sum(n))
## strictestGeography hasDisease n freq
## 1 Sub-Saharan Africa No 1288 0.1310941476
## 2 Sub-Saharan Africa Yes 406 0.0413231552
## 3 North Africa and\nWestern Asia No 15 0.0015267176
## 4 North Africa and\nWestern Asia Yes 3 0.0003053435
## 5 Europe No 3735 0.3801526718
## 6 Europe Yes 1734 0.1764885496
## 7 South Asia No 688 0.0700254453
## 8 South Asia Yes 30 0.0030534351
## 9 Southeast Asia No 49 0.0049872774
## 10 East Asia No 683 0.0695165394
## 11 East Asia Yes 438 0.0445801527
## 12 Asia (NOS) No 173 0.0176081425
## 13 Asia (NOS) Yes 137 0.0139440204
## 14 Oceania No 11 0.0011195929
## 15 Americas No 56 0.0056997455
## 16 Americas Yes 110 0.0111959288
## 17 Multiple No 27 0.0027480916
## 18 Multiple Yes 180 0.0183206107
## 19 Other No 50 0.0050890585
## 20 Other Yes 12 0.0012213740
allSRAFinal %>% drop_na(strictestRace) %>% count(strictestRace, hasDisease) %>% mutate(freq = n/sum(n))
## strictestRace hasDisease n freq
## 1 American Indian and\nAlaska Native No 40 2.845962e-03
## 2 American Indian and\nAlaska Native Yes 3 2.134472e-04
## 3 Asian No 718 5.108502e-02
## 4 Asian Yes 226 1.607969e-02
## 5 Black or\nAfrican American No 1395 9.925293e-02
## 6 Black or\nAfrican American Yes 347 2.468872e-02
## 7 Hispanic No 1058 7.527570e-02
## 8 Hispanic Yes 197 1.401636e-02
## 9 Multiple No 182 1.294913e-02
## 10 Multiple Yes 21 1.494130e-03
## 11 Native Hawaiian and\nother Pacific Islander No 5 3.557453e-04
## 12 Native Hawaiian and\nother Pacific Islander Yes 1 7.114906e-05
## 13 Other No 158 1.124155e-02
## 14 Other Yes 4 2.845962e-04
## 15 White No 7388 5.256492e-01
## 16 White Yes 2312 1.644966e-01
allSRAFinal %>% drop_na(worldRegion) %>% count(worldRegion, hasDisease) %>% mutate(freq = n/sum(n))
## worldRegion hasDisease n freq
## 1 East Asia &\nPacific No 1034 0.0433724832
## 2 East Asia &\nPacific Yes 780 0.0327181208
## 3 Europe &\nCentral Asia No 4532 0.1901006711
## 4 Europe &\nCentral Asia Yes 1425 0.0597734899
## 5 Latin America &\nCaribbean No 15 0.0006291946
## 6 Latin America &\nCaribbean Yes 69 0.0028942953
## 7 Middle East &\nNorth Africa No 11 0.0004614094
## 8 North America No 12107 0.5078439597
## 9 North America Yes 3841 0.1611157718
## 10 South Asia No 12 0.0005033557
## 11 South Asia Yes 14 0.0005872483
allSRAFinal %>% drop_na(strictestGeography, finalDisease) %>% count(strictestGeography, finalDisease) %>% mutate(freq = n/sum(n))
## strictestGeography finalDisease n freq
## 1 Sub-Saharan Africa Autoimmune 164 0.0537704918
## 2 Sub-Saharan Africa Cancer 1 0.0003278689
## 3 Sub-Saharan Africa Healthy control 66 0.0216393443
## 4 Sub-Saharan Africa Infectious 155 0.0508196721
## 5 Sub-Saharan Africa Mental health 8 0.0026229508
## 6 Sub-Saharan Africa Metabolic 6 0.0019672131
## 7 Sub-Saharan Africa Respiratory 6 0.0019672131
## 8 North Africa and\nWestern Asia Healthy control 1 0.0003278689
## 9 North Africa and\nWestern Asia Respiratory 2 0.0006557377
## 10 Europe Acute trauma 4 0.0013114754
## 11 Europe Autoimmune 380 0.1245901639
## 12 Europe Cancer 196 0.0642622951
## 13 Europe Cardiovascular 15 0.0049180328
## 14 Europe Gastrointestinal 2 0.0006557377
## 15 Europe Genetic syndrome 57 0.0186885246
## 16 Europe Healthy control 866 0.2839344262
## 17 Europe Infectious 52 0.0170491803
## 18 Europe Mental health 20 0.0065573770
## 19 Europe Neurodegenerative 47 0.0154098361
## 20 Europe Neurological 6 0.0019672131
## 21 Europe Respiratory 89 0.0291803279
## 22 South Asia Autoimmune 9 0.0029508197
## 23 South Asia Cardiovascular 2 0.0006557377
## 24 South Asia Healthy control 2 0.0006557377
## 25 South Asia Integumentary 7 0.0022950820
## 26 South Asia Reproductive 10 0.0032786885
## 27 East Asia Acute trauma 8 0.0026229508
## 28 East Asia Autoimmune 62 0.0203278689
## 29 East Asia Blood 23 0.0075409836
## 30 East Asia Cancer 256 0.0839344262
## 31 East Asia Cardiovascular 27 0.0088524590
## 32 East Asia Gastrointestinal 5 0.0016393443
## 33 East Asia Healthy control 23 0.0075409836
## 34 East Asia Kidney 17 0.0055737705
## 35 East Asia Neurodegenerative 10 0.0032786885
## 36 East Asia Other 7 0.0022950820
## 37 Asia (NOS) Autoimmune 33 0.0108196721
## 38 Asia (NOS) Cancer 75 0.0245901639
## 39 Asia (NOS) Endocrine 10 0.0032786885
## 40 Asia (NOS) Healthy control 19 0.0062295082
## 41 Americas Autoimmune 5 0.0016393443
## 42 Americas Blood 1 0.0003278689
## 43 Americas Cancer 23 0.0075409836
## 44 Americas Healthy control 38 0.0124590164
## 45 Americas Infectious 28 0.0091803279
## 46 Americas Neurodegenerative 15 0.0049180328
## 47 Multiple Autoimmune 21 0.0068852459
## 48 Multiple Healthy control 119 0.0390163934
## 49 Multiple Infectious 40 0.0131147541
## 50 Other Infectious 1 0.0003278689
## 51 Other Metabolic 1 0.0003278689
## 52 Other Respiratory 10 0.0032786885
allSRAFinal %>% drop_na(strictestRace, finalDisease) %>% count(strictestRace, finalDisease) %>% mutate(freq = n/sum(n))
## strictestRace finalDisease n freq
## 1 American Indian and\nAlaska Native Cancer 3 0.0009643202
## 2 Asian Autoimmune 62 0.0199292832
## 3 Asian Blood 4 0.0012857602
## 4 Asian Cancer 65 0.0208936033
## 5 Asian Healthy control 67 0.0215364834
## 6 Asian Metabolic 1 0.0003214401
## 7 Asian Reproductive 7 0.0022500804
## 8 Asian Respiratory 20 0.0064288010
## 9 Black or\nAfrican American Autoimmune 10 0.0032144005
## 10 Black or\nAfrican American Blood 1 0.0003214401
## 11 Black or\nAfrican American Cancer 100 0.0321440051
## 12 Black or\nAfrican American Healthy control 136 0.0437158470
## 13 Black or\nAfrican American Infectious 25 0.0080360013
## 14 Black or\nAfrican American Mental health 70 0.0225008036
## 15 Black or\nAfrican American Metabolic 2 0.0006428801
## 16 Black or\nAfrican American Neurological 3 0.0009643202
## 17 Hispanic Autoimmune 62 0.0199292832
## 18 Hispanic Cancer 57 0.0183220829
## 19 Hispanic Healthy control 61 0.0196078431
## 20 Hispanic Infectious 4 0.0012857602
## 21 Hispanic Mental health 5 0.0016072003
## 22 Hispanic Metabolic 2 0.0006428801
## 23 Hispanic Neurodegenerative 3 0.0009643202
## 24 Hispanic Neurological 1 0.0003214401
## 25 Hispanic Other 2 0.0006428801
## 26 Multiple Cancer 10 0.0032144005
## 27 Multiple Healthy control 11 0.0035358406
## 28 Native Hawaiian and\nother Pacific Islander Autoimmune 1 0.0003214401
## 29 Other Autoimmune 1 0.0003214401
## 30 Other Cancer 1 0.0003214401
## 31 Other Healthy control 2 0.0006428801
## 32 White Autoimmune 173 0.0556091289
## 33 White Cancer 745 0.2394728383
## 34 White Cardiovascular 13 0.0041787207
## 35 White Healthy control 653 0.2099003536
## 36 White Infectious 32 0.0102860816
## 37 White Mental health 362 0.1163612986
## 38 White Metabolic 222 0.0713596914
## 39 White Neurodegenerative 108 0.0347155256
## 40 White Neurological 4 0.0012857602
allSRAFinal %>% drop_na(worldRegion, finalDisease) %>% count(worldRegion, finalDisease) %>% mutate(freq = n/sum(n))
## worldRegion finalDisease n freq
## 1 East Asia &\nPacific Acute trauma 12 0.0019579050
## 2 East Asia &\nPacific Autoimmune 98 0.0159895578
## 3 East Asia &\nPacific Blood 27 0.0044052863
## 4 East Asia &\nPacific Cancer 380 0.0620003263
## 5 East Asia &\nPacific Cardiovascular 27 0.0044052863
## 6 East Asia &\nPacific Endocrine 10 0.0016315875
## 7 East Asia &\nPacific Gastrointestinal 5 0.0008157938
## 8 East Asia &\nPacific Healthy control 122 0.0199053679
## 9 East Asia &\nPacific Integumentary 7 0.0011421113
## 10 East Asia &\nPacific Kidney 17 0.0027736988
## 11 East Asia &\nPacific Mental health 8 0.0013052700
## 12 East Asia &\nPacific Neurodegenerative 20 0.0032631751
## 13 East Asia &\nPacific Neurological 6 0.0009789525
## 14 East Asia &\nPacific Other 9 0.0014684288
## 15 East Asia &\nPacific Reproductive 7 0.0011421113
## 16 East Asia &\nPacific Respiratory 25 0.0040789688
## 17 Europe &\nCentral Asia Autoimmune 32 0.0052210801
## 18 Europe &\nCentral Asia Cancer 208 0.0339370207
## 19 Europe &\nCentral Asia Cardiovascular 15 0.0024473813
## 20 Europe &\nCentral Asia Genetic syndrome 45 0.0073421439
## 21 Europe &\nCentral Asia Healthy control 834 0.1360744004
## 22 Europe &\nCentral Asia Infectious 208 0.0339370207
## 23 Europe &\nCentral Asia Metabolic 16 0.0026105401
## 24 Europe &\nCentral Asia Neurodegenerative 19 0.0031000163
## 25 Europe &\nCentral Asia Respiratory 48 0.0078316202
## 26 Latin America &\nCaribbean Autoimmune 1 0.0001631588
## 27 Latin America &\nCaribbean Blood 1 0.0001631588
## 28 Latin America &\nCaribbean Cancer 23 0.0037526513
## 29 Latin America &\nCaribbean Healthy control 27 0.0044052863
## 30 Latin America &\nCaribbean Infectious 2 0.0003263175
## 31 Latin America &\nCaribbean Neurodegenerative 15 0.0024473813
## 32 North America Autoimmune 852 0.1390112580
## 33 North America Blood 1 0.0001631588
## 34 North America Cancer 907 0.1479849894
## 35 North America Cardiovascular 13 0.0021210638
## 36 North America Gastrointestinal 2 0.0003263175
## 37 North America Genetic syndrome 12 0.0019579050
## 38 North America Healthy control 1061 0.1731114374
## 39 North America Infectious 127 0.0207211617
## 40 North America Mental health 457 0.0745635503
## 41 North America Metabolic 218 0.0355686083
## 42 North America Neurodegenerative 129 0.0210474792
## 43 North America Neurological 8 0.0013052700
## 44 North America Respiratory 54 0.0088105727
## 45 South Asia Cardiovascular 2 0.0003263175
## 46 South Asia Healthy control 2 0.0003263175
## 47 South Asia Reproductive 10 0.0016315875
allSRAFinal %>% drop_na(strictestGeography, finalDisease) %>% count(finalDisease, strictestGeography) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
## finalDisease strictestGeography n freq diseaseFreq
## 1 Acute trauma Europe 4 0.0013114754 0.3333333333
## 2 Acute trauma East Asia 8 0.0026229508 0.6666666667
## 3 Autoimmune Sub-Saharan Africa 164 0.0537704918 0.2433234421
## 4 Autoimmune Europe 380 0.1245901639 0.5637982196
## 5 Autoimmune South Asia 9 0.0029508197 0.0133531157
## 6 Autoimmune East Asia 62 0.0203278689 0.0919881306
## 7 Autoimmune Asia (NOS) 33 0.0108196721 0.0489614243
## 8 Autoimmune Americas 5 0.0016393443 0.0074183976
## 9 Autoimmune Multiple 21 0.0068852459 0.0311572700
## 10 Blood East Asia 23 0.0075409836 0.9583333333
## 11 Blood Americas 1 0.0003278689 0.0416666667
## 12 Cancer Sub-Saharan Africa 1 0.0003278689 0.0018148820
## 13 Cancer Europe 196 0.0642622951 0.3557168784
## 14 Cancer East Asia 256 0.0839344262 0.4646098004
## 15 Cancer Asia (NOS) 75 0.0245901639 0.1361161525
## 16 Cancer Americas 23 0.0075409836 0.0417422868
## 17 Cardiovascular Europe 15 0.0049180328 0.3409090909
## 18 Cardiovascular South Asia 2 0.0006557377 0.0454545455
## 19 Cardiovascular East Asia 27 0.0088524590 0.6136363636
## 20 Endocrine Asia (NOS) 10 0.0032786885 1.0000000000
## 21 Gastrointestinal Europe 2 0.0006557377 0.2857142857
## 22 Gastrointestinal East Asia 5 0.0016393443 0.7142857143
## 23 Genetic syndrome Europe 57 0.0186885246 1.0000000000
## 24 Healthy control Sub-Saharan Africa 66 0.0216393443 0.0582010582
## 25 Healthy control North Africa and\nWestern Asia 1 0.0003278689 0.0008818342
## 26 Healthy control Europe 866 0.2839344262 0.7636684303
## 27 Healthy control South Asia 2 0.0006557377 0.0017636684
## 28 Healthy control East Asia 23 0.0075409836 0.0202821869
## 29 Healthy control Asia (NOS) 19 0.0062295082 0.0167548501
## 30 Healthy control Americas 38 0.0124590164 0.0335097002
## 31 Healthy control Multiple 119 0.0390163934 0.1049382716
## 32 Infectious Sub-Saharan Africa 155 0.0508196721 0.5615942029
## 33 Infectious Europe 52 0.0170491803 0.1884057971
## 34 Infectious Americas 28 0.0091803279 0.1014492754
## 35 Infectious Multiple 40 0.0131147541 0.1449275362
## 36 Infectious Other 1 0.0003278689 0.0036231884
## 37 Integumentary South Asia 7 0.0022950820 1.0000000000
## 38 Kidney East Asia 17 0.0055737705 1.0000000000
## 39 Mental health Sub-Saharan Africa 8 0.0026229508 0.2857142857
## 40 Mental health Europe 20 0.0065573770 0.7142857143
## 41 Metabolic Sub-Saharan Africa 6 0.0019672131 0.8571428571
## 42 Metabolic Other 1 0.0003278689 0.1428571429
## 43 Neurodegenerative Europe 47 0.0154098361 0.6527777778
## 44 Neurodegenerative East Asia 10 0.0032786885 0.1388888889
## 45 Neurodegenerative Americas 15 0.0049180328 0.2083333333
## 46 Neurological Europe 6 0.0019672131 1.0000000000
## 47 Other East Asia 7 0.0022950820 1.0000000000
## 48 Reproductive South Asia 10 0.0032786885 1.0000000000
## 49 Respiratory Sub-Saharan Africa 6 0.0019672131 0.0560747664
## 50 Respiratory North Africa and\nWestern Asia 2 0.0006557377 0.0186915888
## 51 Respiratory Europe 89 0.0291803279 0.8317757009
## 52 Respiratory Other 10 0.0032786885 0.0934579439
allSRAFinal %>% drop_na(strictestRace, finalDisease) %>% count(finalDisease, strictestRace) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
## finalDisease strictestRace n freq diseaseFreq
## 1 Autoimmune Asian 62 0.0199292832 0.200647249
## 2 Autoimmune Black or\nAfrican American 10 0.0032144005 0.032362460
## 3 Autoimmune Hispanic 62 0.0199292832 0.200647249
## 4 Autoimmune Native Hawaiian and\nother Pacific Islander 1 0.0003214401 0.003236246
## 5 Autoimmune Other 1 0.0003214401 0.003236246
## 6 Autoimmune White 173 0.0556091289 0.559870550
## 7 Blood Asian 4 0.0012857602 0.800000000
## 8 Blood Black or\nAfrican American 1 0.0003214401 0.200000000
## 9 Cancer American Indian and\nAlaska Native 3 0.0009643202 0.003058104
## 10 Cancer Asian 65 0.0208936033 0.066258919
## 11 Cancer Black or\nAfrican American 100 0.0321440051 0.101936799
## 12 Cancer Hispanic 57 0.0183220829 0.058103976
## 13 Cancer Multiple 10 0.0032144005 0.010193680
## 14 Cancer Other 1 0.0003214401 0.001019368
## 15 Cancer White 745 0.2394728383 0.759429154
## 16 Cardiovascular White 13 0.0041787207 1.000000000
## 17 Healthy control Asian 67 0.0215364834 0.072043011
## 18 Healthy control Black or\nAfrican American 136 0.0437158470 0.146236559
## 19 Healthy control Hispanic 61 0.0196078431 0.065591398
## 20 Healthy control Multiple 11 0.0035358406 0.011827957
## 21 Healthy control Other 2 0.0006428801 0.002150538
## 22 Healthy control White 653 0.2099003536 0.702150538
## 23 Infectious Black or\nAfrican American 25 0.0080360013 0.409836066
## 24 Infectious Hispanic 4 0.0012857602 0.065573770
## 25 Infectious White 32 0.0102860816 0.524590164
## 26 Mental health Black or\nAfrican American 70 0.0225008036 0.160183066
## 27 Mental health Hispanic 5 0.0016072003 0.011441648
## 28 Mental health White 362 0.1163612986 0.828375286
## 29 Metabolic Asian 1 0.0003214401 0.004405286
## 30 Metabolic Black or\nAfrican American 2 0.0006428801 0.008810573
## 31 Metabolic Hispanic 2 0.0006428801 0.008810573
## 32 Metabolic White 222 0.0713596914 0.977973568
## 33 Neurodegenerative Hispanic 3 0.0009643202 0.027027027
## 34 Neurodegenerative White 108 0.0347155256 0.972972973
## 35 Neurological Black or\nAfrican American 3 0.0009643202 0.375000000
## 36 Neurological Hispanic 1 0.0003214401 0.125000000
## 37 Neurological White 4 0.0012857602 0.500000000
## 38 Other Hispanic 2 0.0006428801 1.000000000
## 39 Reproductive Asian 7 0.0022500804 1.000000000
## 40 Respiratory Asian 20 0.0064288010 1.000000000
allSRAFinal %>% drop_na(worldRegion, finalDisease) %>% count(finalDisease, worldRegion) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
## finalDisease worldRegion n freq diseaseFreq
## 1 Acute trauma East Asia &\nPacific 12 0.0019579050 1.0000000000
## 2 Autoimmune East Asia &\nPacific 98 0.0159895578 0.0996948118
## 3 Autoimmune Europe &\nCentral Asia 32 0.0052210801 0.0325534079
## 4 Autoimmune Latin America &\nCaribbean 1 0.0001631588 0.0010172940
## 5 Autoimmune North America 852 0.1390112580 0.8667344863
## 6 Blood East Asia &\nPacific 27 0.0044052863 0.9310344828
## 7 Blood Latin America &\nCaribbean 1 0.0001631588 0.0344827586
## 8 Blood North America 1 0.0001631588 0.0344827586
## 9 Cancer East Asia &\nPacific 380 0.0620003263 0.2503293808
## 10 Cancer Europe &\nCentral Asia 208 0.0339370207 0.1370223979
## 11 Cancer Latin America &\nCaribbean 23 0.0037526513 0.0151515152
## 12 Cancer North America 907 0.1479849894 0.5974967062
## 13 Cardiovascular East Asia &\nPacific 27 0.0044052863 0.4736842105
## 14 Cardiovascular Europe &\nCentral Asia 15 0.0024473813 0.2631578947
## 15 Cardiovascular North America 13 0.0021210638 0.2280701754
## 16 Cardiovascular South Asia 2 0.0003263175 0.0350877193
## 17 Endocrine East Asia &\nPacific 10 0.0016315875 1.0000000000
## 18 Gastrointestinal East Asia &\nPacific 5 0.0008157938 0.7142857143
## 19 Gastrointestinal North America 2 0.0003263175 0.2857142857
## 20 Genetic syndrome Europe &\nCentral Asia 45 0.0073421439 0.7894736842
## 21 Genetic syndrome North America 12 0.0019579050 0.2105263158
## 22 Healthy control East Asia &\nPacific 122 0.0199053679 0.0596285435
## 23 Healthy control Europe &\nCentral Asia 834 0.1360744004 0.4076246334
## 24 Healthy control Latin America &\nCaribbean 27 0.0044052863 0.0131964809
## 25 Healthy control North America 1061 0.1731114374 0.5185728250
## 26 Healthy control South Asia 2 0.0003263175 0.0009775171
## 27 Infectious Europe &\nCentral Asia 208 0.0339370207 0.6172106825
## 28 Infectious Latin America &\nCaribbean 2 0.0003263175 0.0059347181
## 29 Infectious North America 127 0.0207211617 0.3768545994
## 30 Integumentary East Asia &\nPacific 7 0.0011421113 1.0000000000
## 31 Kidney East Asia &\nPacific 17 0.0027736988 1.0000000000
## 32 Mental health East Asia &\nPacific 8 0.0013052700 0.0172043011
## 33 Mental health North America 457 0.0745635503 0.9827956989
## 34 Metabolic Europe &\nCentral Asia 16 0.0026105401 0.0683760684
## 35 Metabolic North America 218 0.0355686083 0.9316239316
## 36 Neurodegenerative East Asia &\nPacific 20 0.0032631751 0.1092896175
## 37 Neurodegenerative Europe &\nCentral Asia 19 0.0031000163 0.1038251366
## 38 Neurodegenerative Latin America &\nCaribbean 15 0.0024473813 0.0819672131
## 39 Neurodegenerative North America 129 0.0210474792 0.7049180328
## 40 Neurological East Asia &\nPacific 6 0.0009789525 0.4285714286
## 41 Neurological North America 8 0.0013052700 0.5714285714
## 42 Other East Asia &\nPacific 9 0.0014684288 1.0000000000
## 43 Reproductive East Asia &\nPacific 7 0.0011421113 0.4117647059
## 44 Reproductive South Asia 10 0.0016315875 0.5882352941
## 45 Respiratory East Asia &\nPacific 25 0.0040789688 0.1968503937
## 46 Respiratory Europe &\nCentral Asia 48 0.0078316202 0.3779527559
## 47 Respiratory North America 54 0.0088105727 0.4251968504
# And here comes the awfulness... disease by tissue and the rest:
allSRAFinal %>% drop_na(strictestGeography, finalDisease) %>% count(finalDisease, finalOrgan, strictestGeography) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
## finalDisease finalOrgan strictestGeography n freq diseaseFreq
## 1 Acute trauma Blood vessel Europe 3 0.0009836066 0.2500000000
## 2 Acute trauma Blood vessel East Asia 8 0.0026229508 0.6666666667
## 3 Acute trauma Heart Europe 1 0.0003278689 0.0833333333
## 4 Autoimmune Blood Sub-Saharan Africa 35 0.0114754098 0.0519287834
## 5 Autoimmune Blood Europe 8 0.0026229508 0.0118694362
## 6 Autoimmune Bone marrow Americas 1 0.0003278689 0.0014836795
## 7 Autoimmune Intestine Sub-Saharan Africa 127 0.0416393443 0.1884272997
## 8 Autoimmune Intestine Europe 290 0.0950819672 0.4302670623
## 9 Autoimmune Intestine South Asia 5 0.0016393443 0.0074183976
## 10 Autoimmune Intestine Asia (NOS) 3 0.0009836066 0.0044510386
## 11 Autoimmune Intestine Multiple 21 0.0068852459 0.0311572700
## 12 Autoimmune Joint Sub-Saharan Africa 2 0.0006557377 0.0029673591
## 13 Autoimmune Joint Europe 50 0.0163934426 0.0741839763
## 14 Autoimmune Joint South Asia 4 0.0013114754 0.0059347181
## 15 Autoimmune Joint East Asia 2 0.0006557377 0.0029673591
## 16 Autoimmune Joint Asia (NOS) 30 0.0098360656 0.0445103858
## 17 Autoimmune Joint Americas 4 0.0013114754 0.0059347181
## 18 Autoimmune Skin Europe 18 0.0059016393 0.0267062315
## 19 Autoimmune Skin East Asia 60 0.0196721311 0.0890207715
## 20 Autoimmune Thyroid Europe 14 0.0045901639 0.0207715134
## 21 Blood Bone marrow East Asia 23 0.0075409836 0.9583333333
## 22 Blood Bone marrow Americas 1 0.0003278689 0.0416666667
## 23 Cancer Bladder East Asia 10 0.0032786885 0.0181488203
## 24 Cancer Blood vessel Europe 24 0.0078688525 0.0435571688
## 25 Cancer Bone marrow East Asia 20 0.0065573770 0.0362976407
## 26 Cancer Bone marrow Americas 23 0.0075409836 0.0417422868
## 27 Cancer Breast Europe 30 0.0098360656 0.0544464610
## 28 Cancer Cancer sample\n(NOS) Sub-Saharan Africa 1 0.0003278689 0.0018148820
## 29 Cancer Cancer sample\n(NOS) Europe 14 0.0045901639 0.0254083485
## 30 Cancer Cancer sample\n(NOS) Asia (NOS) 2 0.0006557377 0.0036297641
## 31 Cancer IPSC Asia (NOS) 39 0.0127868852 0.0707803993
## 32 Cancer Intestine Europe 4 0.0013114754 0.0072595281
## 33 Cancer Intestine East Asia 208 0.0681967213 0.3774954628
## 34 Cancer Liver East Asia 18 0.0059016393 0.0326678766
## 35 Cancer Liver Asia (NOS) 34 0.0111475410 0.0617059891
## 36 Cancer Lung Europe 1 0.0003278689 0.0018148820
## 37 Cancer Prostate Europe 94 0.0308196721 0.1705989111
## 38 Cancer Skin Europe 9 0.0029508197 0.0163339383
## 39 Cancer <NA> Europe 20 0.0065573770 0.0362976407
## 40 Cardiovascular Blood East Asia 6 0.0019672131 0.1363636364
## 41 Cardiovascular Blood vessel East Asia 21 0.0068852459 0.4772727273
## 42 Cardiovascular Heart Europe 15 0.0049180328 0.3409090909
## 43 Cardiovascular Testis South Asia 2 0.0006557377 0.0454545455
## 44 Endocrine Thyroid Asia (NOS) 10 0.0032786885 1.0000000000
## 45 Gastrointestinal Blood East Asia 5 0.0016393443 0.7142857143
## 46 Gastrointestinal Intestine Europe 2 0.0006557377 0.2857142857
## 47 Genetic syndrome Blood Europe 38 0.0124590164 0.6666666667
## 48 Genetic syndrome IPSC Europe 12 0.0039344262 0.2105263158
## 49 Genetic syndrome Nose Europe 7 0.0022950820 0.1228070175
## 50 Healthy control Blood Sub-Saharan Africa 27 0.0088524590 0.0238095238
## 51 Healthy control Blood Europe 73 0.0239344262 0.0643738977
## 52 Healthy control Blood East Asia 22 0.0072131148 0.0194003527
## 53 Healthy control Blood Asia (NOS) 1 0.0003278689 0.0008818342
## 54 Healthy control Blood Americas 11 0.0036065574 0.0097001764
## 55 Healthy control Blood Multiple 113 0.0370491803 0.0996472663
## 56 Healthy control Blood vessel Sub-Saharan Africa 8 0.0026229508 0.0070546737
## 57 Healthy control Blood vessel North Africa and\nWestern Asia 1 0.0003278689 0.0008818342
## 58 Healthy control Blood vessel Europe 12 0.0039344262 0.0105820106
## 59 Healthy control Blood vessel East Asia 1 0.0003278689 0.0008818342
## 60 Healthy control Bone marrow Europe 12 0.0039344262 0.0105820106
## 61 Healthy control Brain Sub-Saharan Africa 8 0.0026229508 0.0070546737
## 62 Healthy control Brain Europe 18 0.0059016393 0.0158730159
## 63 Healthy control IPSC Europe 610 0.2000000000 0.5379188713
## 64 Healthy control IPSC South Asia 2 0.0006557377 0.0017636684
## 65 Healthy control IPSC Asia (NOS) 15 0.0049180328 0.0132275132
## 66 Healthy control IPSC Americas 27 0.0088524590 0.0238095238
## 67 Healthy control Intestine Sub-Saharan Africa 23 0.0075409836 0.0202821869
## 68 Healthy control Intestine Europe 45 0.0147540984 0.0396825397
## 69 Healthy control Intestine Asia (NOS) 2 0.0006557377 0.0017636684
## 70 Healthy control Intestine Multiple 6 0.0019672131 0.0052910053
## 71 Healthy control Lung Europe 12 0.0039344262 0.0105820106
## 72 Healthy control Muscle Europe 16 0.0052459016 0.0141093474
## 73 Healthy control Nose Europe 28 0.0091803279 0.0246913580
## 74 Healthy control Skin Europe 11 0.0036065574 0.0097001764
## 75 Healthy control Testis Europe 2 0.0006557377 0.0017636684
## 76 Healthy control Testis Asia (NOS) 1 0.0003278689 0.0008818342
## 77 Healthy control Thyroid Europe 5 0.0016393443 0.0044091711
## 78 Healthy control Trachea Europe 12 0.0039344262 0.0105820106
## 79 Healthy control <NA> Europe 10 0.0032786885 0.0088183422
## 80 Infectious Blood Sub-Saharan Africa 151 0.0495081967 0.5471014493
## 81 Infectious Blood Europe 44 0.0144262295 0.1594202899
## 82 Infectious Blood Americas 26 0.0085245902 0.0942028986
## 83 Infectious Blood Multiple 40 0.0131147541 0.1449275362
## 84 Infectious Blood Other 1 0.0003278689 0.0036231884
## 85 Infectious Bone marrow Americas 2 0.0006557377 0.0072463768
## 86 Infectious Lung Sub-Saharan Africa 4 0.0013114754 0.0144927536
## 87 Infectious <NA> Europe 8 0.0026229508 0.0289855072
## 88 Integumentary Blood South Asia 7 0.0022950820 1.0000000000
## 89 Kidney Blood East Asia 17 0.0055737705 1.0000000000
## 90 Mental health Brain Sub-Saharan Africa 8 0.0026229508 0.2857142857
## 91 Mental health IPSC Europe 20 0.0065573770 0.7142857143
## 92 Metabolic Blood Sub-Saharan Africa 6 0.0019672131 0.8571428571
## 93 Metabolic Blood Other 1 0.0003278689 0.1428571429
## 94 Neurodegenerative Blood East Asia 10 0.0032786885 0.1388888889
## 95 Neurodegenerative Brain Europe 10 0.0032786885 0.1388888889
## 96 Neurodegenerative CNS Europe 7 0.0022950820 0.0972222222
## 97 Neurodegenerative IPSC Europe 12 0.0039344262 0.1666666667
## 98 Neurodegenerative IPSC Americas 15 0.0049180328 0.2083333333
## 99 Neurodegenerative <NA> Europe 18 0.0059016393 0.2500000000
## 100 Neurological Skin Europe 6 0.0019672131 1.0000000000
## 101 Other Blood East Asia 7 0.0022950820 1.0000000000
## 102 Reproductive Testis South Asia 10 0.0032786885 1.0000000000
## 103 Respiratory Blood Europe 74 0.0242622951 0.6915887850
## 104 Respiratory Blood Other 10 0.0032786885 0.0934579439
## 105 Respiratory Blood vessel North Africa and\nWestern Asia 2 0.0006557377 0.0186915888
## 106 Respiratory Blood vessel Europe 3 0.0009836066 0.0280373832
## 107 Respiratory Lung Sub-Saharan Africa 6 0.0019672131 0.0560747664
## 108 Respiratory Lung Europe 12 0.0039344262 0.1121495327
allSRAFinal %>% drop_na(strictestRace, finalDisease) %>% count(finalDisease, finalOrgan, strictestRace) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
## finalDisease finalOrgan strictestRace n freq diseaseFreq
## 1 Autoimmune Blood Asian 36 0.0115718419 0.116504854
## 2 Autoimmune Blood Black or\nAfrican American 10 0.0032144005 0.032362460
## 3 Autoimmune Blood Hispanic 59 0.0189649630 0.190938511
## 4 Autoimmune Blood Native Hawaiian and\nother Pacific Islander 1 0.0003214401 0.003236246
## 5 Autoimmune Blood Other 1 0.0003214401 0.003236246
## 6 Autoimmune Blood White 173 0.0556091289 0.559870550
## 7 Autoimmune Intestine Asian 26 0.0083574413 0.084142395
## 8 Autoimmune Intestine Hispanic 3 0.0009643202 0.009708738
## 9 Blood Blood Asian 4 0.0012857602 0.800000000
## 10 Blood Blood Black or\nAfrican American 1 0.0003214401 0.200000000
## 11 Cancer Blood White 3 0.0009643202 0.003058104
## 12 Cancer Bone marrow Asian 1 0.0003214401 0.001019368
## 13 Cancer Bone marrow Hispanic 1 0.0003214401 0.001019368
## 14 Cancer Bone marrow White 8 0.0025715204 0.008154944
## 15 Cancer Breast Asian 36 0.0115718419 0.036697248
## 16 Cancer Breast Black or\nAfrican American 40 0.0128576021 0.040774720
## 17 Cancer Breast Hispanic 1 0.0003214401 0.001019368
## 18 Cancer Breast White 41 0.0131790421 0.041794088
## 19 Cancer Cancer sample\n(NOS) American Indian and\nAlaska Native 3 0.0009643202 0.003058104
## 20 Cancer Cancer sample\n(NOS) Asian 28 0.0090003214 0.028542304
## 21 Cancer Cancer sample\n(NOS) Black or\nAfrican American 58 0.0186435230 0.059123344
## 22 Cancer Cancer sample\n(NOS) Hispanic 14 0.0045001607 0.014271152
## 23 Cancer Cancer sample\n(NOS) Multiple 10 0.0032144005 0.010193680
## 24 Cancer Cancer sample\n(NOS) Other 1 0.0003214401 0.001019368
## 25 Cancer Cancer sample\n(NOS) White 606 0.1947926712 0.617737003
## 26 Cancer Larynx White 1 0.0003214401 0.001019368
## 27 Cancer Lung White 9 0.0028929605 0.009174312
## 28 Cancer Lymph node White 2 0.0006428801 0.002038736
## 29 Cancer Oral cavity Black or\nAfrican American 1 0.0003214401 0.001019368
## 30 Cancer Oral cavity White 16 0.0051430408 0.016309888
## 31 Cancer Prostate White 48 0.0154291225 0.048929664
## 32 Cancer Skin White 6 0.0019286403 0.006116208
## 33 Cancer Stomach Hispanic 41 0.0131790421 0.041794088
## 34 Cancer Tonsil Black or\nAfrican American 1 0.0003214401 0.001019368
## 35 Cancer Tonsil White 5 0.0016072003 0.005096840
## 36 Cardiovascular Blood White 12 0.0038572806 0.923076923
## 37 Cardiovascular Liver White 1 0.0003214401 0.076923077
## 38 Healthy control Adipose Multiple 1 0.0003214401 0.001075269
## 39 Healthy control Bladder Multiple 1 0.0003214401 0.001075269
## 40 Healthy control Blood Asian 64 0.0205721633 0.068817204
## 41 Healthy control Blood Black or\nAfrican American 48 0.0154291225 0.051612903
## 42 Healthy control Blood Hispanic 32 0.0102860816 0.034408602
## 43 Healthy control Blood Other 2 0.0006428801 0.002150538
## 44 Healthy control Blood White 282 0.0906460945 0.303225806
## 45 Healthy control Brain Black or\nAfrican American 86 0.0276438444 0.092473118
## 46 Healthy control Brain Hispanic 19 0.0061073610 0.020430108
## 47 Healthy control Brain White 186 0.0597878496 0.200000000
## 48 Healthy control Heart Multiple 2 0.0006428801 0.002150538
## 49 Healthy control Heart White 1 0.0003214401 0.001075269
## 50 Healthy control IPSC White 169 0.0543233687 0.181720430
## 51 Healthy control Intestine Hispanic 2 0.0006428801 0.002150538
## 52 Healthy control Intestine Multiple 2 0.0006428801 0.002150538
## 53 Healthy control Lung Hispanic 6 0.0019286403 0.006451613
## 54 Healthy control Lung Multiple 1 0.0003214401 0.001075269
## 55 Healthy control Muscle Multiple 1 0.0003214401 0.001075269
## 56 Healthy control Skin White 1 0.0003214401 0.001075269
## 57 Healthy control Spleen Multiple 1 0.0003214401 0.001075269
## 58 Healthy control Stomach Multiple 1 0.0003214401 0.001075269
## 59 Healthy control Thymus Multiple 1 0.0003214401 0.001075269
## 60 Healthy control Uterus Asian 3 0.0009643202 0.003225806
## 61 Healthy control <NA> Black or\nAfrican American 2 0.0006428801 0.002150538
## 62 Healthy control <NA> Hispanic 2 0.0006428801 0.002150538
## 63 Healthy control <NA> White 14 0.0045001607 0.015053763
## 64 Infectious Blood Black or\nAfrican American 25 0.0080360013 0.409836066
## 65 Infectious Blood Hispanic 4 0.0012857602 0.065573770
## 66 Infectious Blood White 32 0.0102860816 0.524590164
## 67 Mental health Adipose White 2 0.0006428801 0.004576659
## 68 Mental health Adrenal gland White 2 0.0006428801 0.004576659
## 69 Mental health Blood vessel White 2 0.0006428801 0.004576659
## 70 Mental health Brain Black or\nAfrican American 70 0.0225008036 0.160183066
## 71 Mental health Brain Hispanic 5 0.0016072003 0.011441648
## 72 Mental health Brain White 198 0.0636451302 0.453089245
## 73 Mental health Digestive tract White 2 0.0006428801 0.004576659
## 74 Mental health Heart White 3 0.0009643202 0.006864989
## 75 Mental health IPSC White 135 0.0433944069 0.308924485
## 76 Mental health Intestine White 4 0.0012857602 0.009153318
## 77 Mental health Lung White 1 0.0003214401 0.002288330
## 78 Mental health Muscle White 2 0.0006428801 0.004576659
## 79 Mental health Ovary White 1 0.0003214401 0.002288330
## 80 Mental health Pancreas White 2 0.0006428801 0.004576659
## 81 Mental health Spleen White 2 0.0006428801 0.004576659
## 82 Mental health Stomach White 2 0.0006428801 0.004576659
## 83 Mental health <NA> White 4 0.0012857602 0.009153318
## 84 Metabolic Adrenal gland White 1 0.0003214401 0.004405286
## 85 Metabolic IPSC Hispanic 2 0.0006428801 0.008810573
## 86 Metabolic IPSC White 22 0.0070716811 0.096916300
## 87 Metabolic Kidney White 2 0.0006428801 0.008810573
## 88 Metabolic Liver White 191 0.0613950498 0.841409692
## 89 Metabolic Skin Asian 1 0.0003214401 0.004405286
## 90 Metabolic Skin Black or\nAfrican American 2 0.0006428801 0.008810573
## 91 Metabolic Skin White 6 0.0019286403 0.026431718
## 92 Neurodegenerative Brain Hispanic 3 0.0009643202 0.027027027
## 93 Neurodegenerative Brain White 108 0.0347155256 0.972972973
## 94 Neurological Brain Black or\nAfrican American 3 0.0009643202 0.375000000
## 95 Neurological Brain Hispanic 1 0.0003214401 0.125000000
## 96 Neurological Brain White 4 0.0012857602 0.500000000
## 97 Other Blood vessel Hispanic 2 0.0006428801 1.000000000
## 98 Reproductive Uterus Asian 7 0.0022500804 1.000000000
## 99 Respiratory Blood Asian 20 0.0064288010 1.000000000
allSRAFinal %>% drop_na(worldRegion, finalDisease) %>% count(finalDisease, finalOrgan, worldRegion) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
## finalDisease finalOrgan worldRegion n freq diseaseFreq
## 1 Acute trauma Blood vessel East Asia &\nPacific 11 0.0017947463 0.9166666667
## 2 Acute trauma Heart East Asia &\nPacific 1 0.0001631588 0.0833333333
## 3 Autoimmune Blood East Asia &\nPacific 12 0.0019579050 0.0122075280
## 4 Autoimmune Blood North America 311 0.0507423723 0.3163784334
## 5 Autoimmune Bone marrow Latin America &\nCaribbean 1 0.0001631588 0.0010172940
## 6 Autoimmune Intestine East Asia &\nPacific 26 0.0042421276 0.0264496439
## 7 Autoimmune Intestine North America 449 0.0732582803 0.4567650051
## 8 Autoimmune Joint North America 92 0.0150106053 0.0935910478
## 9 Autoimmune Skin East Asia &\nPacific 60 0.0097895252 0.0610376399
## 10 Autoimmune Skin Europe &\nCentral Asia 18 0.0029368576 0.0183112920
## 11 Autoimmune Thyroid Europe &\nCentral Asia 14 0.0022842225 0.0142421160
## 12 Blood Blood East Asia &\nPacific 4 0.0006526350 0.1379310345
## 13 Blood Blood North America 1 0.0001631588 0.0344827586
## 14 Blood Bone marrow East Asia &\nPacific 23 0.0037526513 0.7931034483
## 15 Blood Bone marrow Latin America &\nCaribbean 1 0.0001631588 0.0344827586
## 16 Cancer Bladder East Asia &\nPacific 10 0.0016315875 0.0065876153
## 17 Cancer Blood North America 3 0.0004894763 0.0019762846
## 18 Cancer Blood vessel Europe &\nCentral Asia 24 0.0039158101 0.0158102767
## 19 Cancer Bone marrow East Asia &\nPacific 20 0.0032631751 0.0131752306
## 20 Cancer Bone marrow Latin America &\nCaribbean 23 0.0037526513 0.0151515152
## 21 Cancer Bone marrow North America 10 0.0016315875 0.0065876153
## 22 Cancer Breast East Asia &\nPacific 36 0.0058737151 0.0237154150
## 23 Cancer Breast Europe &\nCentral Asia 30 0.0048947626 0.0197628458
## 24 Cancer Breast North America 82 0.0133790178 0.0540184453
## 25 Cancer Cancer sample\n(NOS) East Asia &\nPacific 15 0.0024473813 0.0098814229
## 26 Cancer Cancer sample\n(NOS) North America 708 0.1155163975 0.4664031621
## 27 Cancer IPSC East Asia &\nPacific 39 0.0063631914 0.0256916996
## 28 Cancer Intestine East Asia &\nPacific 208 0.0339370207 0.1370223979
## 29 Cancer Intestine North America 4 0.0006526350 0.0026350461
## 30 Cancer Larynx North America 1 0.0001631588 0.0006587615
## 31 Cancer Liver East Asia &\nPacific 52 0.0084842552 0.0342555995
## 32 Cancer Lung Europe &\nCentral Asia 1 0.0001631588 0.0006587615
## 33 Cancer Lung North America 9 0.0014684288 0.0059288538
## 34 Cancer Lymph node North America 2 0.0003263175 0.0013175231
## 35 Cancer Oral cavity North America 17 0.0027736988 0.0111989460
## 36 Cancer Prostate Europe &\nCentral Asia 142 0.0231685430 0.0935441370
## 37 Cancer Skin Europe &\nCentral Asia 9 0.0014684288 0.0059288538
## 38 Cancer Skin North America 6 0.0009789525 0.0039525692
## 39 Cancer Stomach North America 41 0.0066895089 0.0270092227
## 40 Cancer Tonsil North America 6 0.0009789525 0.0039525692
## 41 Cancer <NA> Europe &\nCentral Asia 2 0.0003263175 0.0013175231
## 42 Cancer <NA> North America 18 0.0029368576 0.0118577075
## 43 Cardiovascular Blood East Asia &\nPacific 6 0.0009789525 0.1052631579
## 44 Cardiovascular Blood North America 12 0.0019579050 0.2105263158
## 45 Cardiovascular Blood vessel East Asia &\nPacific 21 0.0034263338 0.3684210526
## 46 Cardiovascular Heart Europe &\nCentral Asia 15 0.0024473813 0.2631578947
## 47 Cardiovascular Liver North America 1 0.0001631588 0.0175438596
## 48 Cardiovascular Testis South Asia 2 0.0003263175 0.0350877193
## 49 Endocrine Thyroid East Asia &\nPacific 10 0.0016315875 1.0000000000
## 50 Gastrointestinal Blood East Asia &\nPacific 5 0.0008157938 0.7142857143
## 51 Gastrointestinal Intestine North America 2 0.0003263175 0.2857142857
## 52 Genetic syndrome Blood Europe &\nCentral Asia 38 0.0062000326 0.6666666667
## 53 Genetic syndrome IPSC North America 12 0.0019579050 0.2105263158
## 54 Genetic syndrome Nose Europe &\nCentral Asia 7 0.0011421113 0.1228070175
## 55 Healthy control Adipose North America 1 0.0001631588 0.0004887586
## 56 Healthy control Bladder North America 1 0.0001631588 0.0004887586
## 57 Healthy control Blood East Asia &\nPacific 58 0.0094632077 0.0283479961
## 58 Healthy control Blood Europe &\nCentral Asia 190 0.0310001632 0.0928641251
## 59 Healthy control Blood North America 427 0.0696687877 0.2086999022
## 60 Healthy control Blood vessel East Asia &\nPacific 22 0.0035894926 0.0107526882
## 61 Healthy control Bone marrow North America 12 0.0019579050 0.0058651026
## 62 Healthy control Brain East Asia &\nPacific 10 0.0016315875 0.0048875855
## 63 Healthy control Brain Europe &\nCentral Asia 5 0.0008157938 0.0024437928
## 64 Healthy control Brain North America 302 0.0492739435 0.1476050831
## 65 Healthy control Heart North America 3 0.0004894763 0.0014662757
## 66 Healthy control IPSC East Asia &\nPacific 20 0.0032631751 0.0097751711
## 67 Healthy control IPSC Europe &\nCentral Asia 582 0.0949583945 0.2844574780
## 68 Healthy control IPSC Latin America &\nCaribbean 27 0.0044052863 0.0131964809
## 69 Healthy control IPSC North America 194 0.0316527982 0.0948191593
## 70 Healthy control Intestine North America 80 0.0130527003 0.0391006843
## 71 Healthy control Lung Europe &\nCentral Asia 18 0.0029368576 0.0087976540
## 72 Healthy control Lung North America 1 0.0001631588 0.0004887586
## 73 Healthy control Muscle Europe &\nCentral Asia 16 0.0026105401 0.0078201369
## 74 Healthy control Muscle North America 1 0.0001631588 0.0004887586
## 75 Healthy control Nose Europe &\nCentral Asia 10 0.0016315875 0.0048875855
## 76 Healthy control Skin East Asia &\nPacific 4 0.0006526350 0.0019550342
## 77 Healthy control Skin Europe &\nCentral Asia 8 0.0013052700 0.0039100684
## 78 Healthy control Spleen North America 1 0.0001631588 0.0004887586
## 79 Healthy control Stomach North America 1 0.0001631588 0.0004887586
## 80 Healthy control Testis East Asia &\nPacific 1 0.0001631588 0.0004887586
## 81 Healthy control Testis South Asia 2 0.0003263175 0.0009775171
## 82 Healthy control Thymus North America 1 0.0001631588 0.0004887586
## 83 Healthy control Thyroid Europe &\nCentral Asia 5 0.0008157938 0.0024437928
## 84 Healthy control Trachea North America 12 0.0019579050 0.0058651026
## 85 Healthy control Uterus East Asia &\nPacific 3 0.0004894763 0.0014662757
## 86 Healthy control <NA> East Asia &\nPacific 4 0.0006526350 0.0019550342
## 87 Healthy control <NA> North America 24 0.0039158101 0.0117302053
## 88 Infectious Blood Europe &\nCentral Asia 208 0.0339370207 0.6172106825
## 89 Infectious Blood North America 115 0.0187632566 0.3412462908
## 90 Infectious Bone marrow Latin America &\nCaribbean 2 0.0003263175 0.0059347181
## 91 Infectious Lung North America 4 0.0006526350 0.0118694362
## 92 Infectious <NA> North America 8 0.0013052700 0.0237388724
## 93 Integumentary Blood East Asia &\nPacific 7 0.0011421113 1.0000000000
## 94 Kidney Blood East Asia &\nPacific 17 0.0027736988 1.0000000000
## 95 Mental health Adipose North America 2 0.0003263175 0.0043010753
## 96 Mental health Adrenal gland North America 2 0.0003263175 0.0043010753
## 97 Mental health Blood vessel North America 2 0.0003263175 0.0043010753
## 98 Mental health Brain North America 281 0.0458476097 0.6043010753
## 99 Mental health Digestive tract North America 2 0.0003263175 0.0043010753
## 100 Mental health Heart North America 3 0.0004894763 0.0064516129
## 101 Mental health IPSC East Asia &\nPacific 8 0.0013052700 0.0172043011
## 102 Mental health IPSC North America 147 0.0239843368 0.3161290323
## 103 Mental health Intestine North America 4 0.0006526350 0.0086021505
## 104 Mental health Lung North America 1 0.0001631588 0.0021505376
## 105 Mental health Muscle North America 2 0.0003263175 0.0043010753
## 106 Mental health Ovary North America 1 0.0001631588 0.0021505376
## 107 Mental health Pancreas North America 2 0.0003263175 0.0043010753
## 108 Mental health Spleen North America 2 0.0003263175 0.0043010753
## 109 Mental health Stomach North America 2 0.0003263175 0.0043010753
## 110 Mental health <NA> North America 4 0.0006526350 0.0086021505
## 111 Metabolic Adrenal gland North America 1 0.0001631588 0.0042735043
## 112 Metabolic Blood Europe &\nCentral Asia 7 0.0011421113 0.0299145299
## 113 Metabolic IPSC North America 24 0.0039158101 0.1025641026
## 114 Metabolic Kidney North America 2 0.0003263175 0.0085470085
## 115 Metabolic Liver North America 191 0.0311633219 0.8162393162
## 116 Metabolic Skin Europe &\nCentral Asia 9 0.0014684288 0.0384615385
## 117 Neurodegenerative Blood East Asia &\nPacific 10 0.0016315875 0.0546448087
## 118 Neurodegenerative Brain East Asia &\nPacific 10 0.0016315875 0.0546448087
## 119 Neurodegenerative Brain Europe &\nCentral Asia 7 0.0011421113 0.0382513661
## 120 Neurodegenerative Brain North America 104 0.0169685104 0.5683060109
## 121 Neurodegenerative CNS North America 7 0.0011421113 0.0382513661
## 122 Neurodegenerative IPSC Europe &\nCentral Asia 12 0.0019579050 0.0655737705
## 123 Neurodegenerative IPSC Latin America &\nCaribbean 15 0.0024473813 0.0819672131
## 124 Neurodegenerative <NA> North America 18 0.0029368576 0.0983606557
## 125 Neurological Brain North America 8 0.0013052700 0.5714285714
## 126 Neurological Skin East Asia &\nPacific 6 0.0009789525 0.4285714286
## 127 Other Blood East Asia &\nPacific 7 0.0011421113 0.7777777778
## 128 Other Blood vessel East Asia &\nPacific 2 0.0003263175 0.2222222222
## 129 Reproductive Testis South Asia 10 0.0016315875 0.5882352941
## 130 Reproductive Uterus East Asia &\nPacific 7 0.0011421113 0.4117647059
## 131 Respiratory Blood East Asia &\nPacific 20 0.0032631751 0.1574803150
## 132 Respiratory Blood Europe &\nCentral Asia 30 0.0048947626 0.2362204724
## 133 Respiratory Blood North America 54 0.0088105727 0.4251968504
## 134 Respiratory Blood vessel East Asia &\nPacific 5 0.0008157938 0.0393700787
## 135 Respiratory Lung Europe &\nCentral Asia 18 0.0029368576 0.1417322835
And then we write the file out, for the joint plotting adventures…
saveRDS(allSRAFinal, file="20240901_allSRAFinal_for_plotting.rds")